Source code for galaxy.datatypes.dataproviders.base
"""
Base class(es) for all DataProviders.
"""
# there's a blurry line between functionality here and functionality in datatypes module
# attempting to keep parsing to a minimum here and focus on chopping/pagination/reformat(/filtering-maybe?)
# and using as much pre-computed info/metadata from the datatypes module as possible
# also, this shouldn't be a replacement/re-implementation of the tool layer
# (which provides traceability/versioning/reproducibility)
import logging
from collections import deque
from typing import Dict
from . import exceptions
log = logging.getLogger(__name__)
_TODO = """
hooks into datatypes (define providers inside datatype modules) as factories
capture tell() when provider is done
def stop( self ): self.endpoint = source.tell(); raise StopIteration()
implement __len__ sensibly where it can be (would be good to have where we're giving some progress - '100 of 300')
seems like sniffed files would have this info
unit tests
add datum entry/exit point methods: possibly decode, encode
or create a class that pipes source through - how would decode work then?
incorporate existing visualization/dataproviders
some of the sources (esp. in datasets) don't need to be re-created
YAGNI: InterleavingMultiSourceDataProvider, CombiningMultiSourceDataProvider
datasets API entry point:
kwargs should be parsed from strings 2 layers up (in the DatasetsAPI) - that's the 'proper' place for that.
but how would it know how/what to parse if it doesn't have access to the classes used in the provider?
Building a giant list by sweeping all possible dprov classes doesn't make sense
For now - I'm burying them in the class __init__s - but I don't like that
"""
MAX_LIMIT = 10000
# ----------------------------------------------------------------------------- base classes
class HasSettings(type):
"""
Metaclass for data providers that allows defining and inheriting
a dictionary named 'settings'.
    Useful for allowing class-level access to the expected types of variables
    passed to a class's `__init__`, so they can be parsed from a query string.
"""
# yeah - this is all too acrobatic
def __new__(cls, name, base_classes, attributes):
settings = {}
# get settings defined in base classes
for base_class in base_classes:
base_settings = getattr(base_class, "settings", None)
if base_settings:
settings.update(base_settings)
# get settings defined in this class
if new_settings := attributes.pop("settings", None):
settings.update(new_settings)
attributes["settings"] = settings
return type.__new__(cls, name, base_classes, attributes)
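# A minimal usage sketch (hypothetical classes, not part of this module):
#   the metaclass merges inherited `settings` dicts, and the merged type map
#   can then drive coercion of query-string kwargs:
#
#     class Base(metaclass=HasSettings):
#         settings = {"limit": "int"}
#
#     class Child(Base):
#         settings = {"offset": "int"}
#
#     assert Child.settings == {"limit": "int", "offset": "int"}
#
#     casts = {"int": int, "str": str}
#     raw_kwargs = {"limit": "100", "offset": "10"}
#     parsed = {k: casts[Child.settings[k]](v) for k, v in raw_kwargs.items()}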
# ----------------------------------------------------------------------------- base classes
class DataProvider(metaclass=HasSettings):
"""
Base class for all data providers. Data providers:
    - have a source (which must at least be iterable and is usually another file-like object)
- implement both the iterator and context manager interfaces
- do not allow write methods (but otherwise implement the other file object interface methods)
"""
# a definition of expected types for keyword arguments sent to __init__
# useful for controlling how query string dictionaries can be parsed into correct types for __init__
# empty in this base class
settings: Dict[str, str] = {}
    def __init__(self, source, **kwargs):
"""Sets up a data provider, validates supplied source.
:param source: the source that this iterator will loop over.
(Should implement the iterable interface and ideally have the
context manager interface as well)
"""
self.source = self.validate_source(source)
    def validate_source(self, source):
"""
Is this a valid source for this provider?
:raises InvalidDataProviderSource: if the source is considered invalid.
Meant to be overridden in subclasses.
"""
if not source or not hasattr(source, "__iter__"):
# that's by no means a thorough check
raise exceptions.InvalidDataProviderSource(source)
return source
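    # A hypothetical override sketch (not part of this module): a subclass
    #   could tighten validation, e.g. by requiring a readable source:
    #
    #     class ReadableDataProvider(DataProvider):
    #         def validate_source(self, source):
    #             source = super().validate_source(source)
    #             if not hasattr(source, "read"):
    #                 raise exceptions.InvalidDataProviderSource(source)
    #             return source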
# TODO: (this might cause problems later...)
# TODO: some providers (such as chunk's seek and read) rely on this... remove
def __getattr__(self, name):
if name == "source":
# if we're inside this fn, source hasn't been set - provide some safety just for this attr
return None
# otherwise, try to get the attr from the source - allows us to get things like provider.encoding, etc.
if hasattr(self.source, name):
return getattr(self.source, name)
# raise the proper error
return self.__getattribute__(name)
# write methods should not be allowed
    def writelines(self, sequence):
raise NotImplementedError("Write methods are purposely disabled")
# TODO: route read methods through next?
# def readline( self ):
# return self.next()
# iterator interface
def __iter__(self):
# it's generators all the way up, Timmy
with self:
yield from self.source
def __next__(self):
return next(self.source)
# context manager interface
def __enter__(self):
# make the source's context manager interface optional
if hasattr(self.source, "__enter__"):
self.source.__enter__()
return self
def __exit__(self, *args):
# make the source's context manager interface optional, call on source if there
if hasattr(self.source, "__exit__"):
self.source.__exit__(*args)
# alternately, call close()
elif hasattr(self.source, "close"):
self.source.close()
def __str__(self):
"""
String representation for easier debugging.
Will call `__str__` on its source so this will display piped dataproviders.
"""
# we need to protect against recursion (in __getattr__) if self.source hasn't been set
source_str = str(self.source) if hasattr(self, "source") else ""
return f"{self.__class__.__name__}({str(source_str)})"
class FilteredDataProvider(DataProvider):
"""
Passes each datum through a filter function and yields it if that function
returns a non-`None` value.
Also maintains counters:
- `num_data_read`: how many data have been consumed from the source.
- `num_valid_data_read`: how many data have been returned from `filter`.
    - `num_data_returned`: how many data this provider has yielded.
"""
# not useful here - we don't want functions over the query string
# settings.update({ 'filter_fn': 'function' })
    def __init__(self, source, filter_fn=None, **kwargs):
"""
:param filter_fn: a lambda or function that will be passed a datum and
return either the (optionally modified) datum or None.
"""
super().__init__(source, **kwargs)
self.filter_fn = filter_fn if callable(filter_fn) else None
# count how many data we got from the source
self.num_data_read = 0
        # how many valid data have we gotten from the source
        #   i.e. data that passed the filter and were either provided or skipped due to an offset
self.num_valid_data_read = 0
# how many lines have been provided/output
self.num_data_returned = 0
def __iter__(self):
parent_gen = super().__iter__()
for datum in parent_gen:
self.num_data_read += 1
datum = self.filter(datum)
if datum is not None:
self.num_valid_data_read += 1
self.num_data_returned += 1
yield datum
# TODO: may want to squash this into DataProvider
    def filter(self, datum):
"""
When given a datum from the provider's source, return None if the datum
'does not pass' the filter or is invalid. Return the datum if it's valid.
:param datum: the datum to check for validity.
:returns: the datum, a modified datum, or None
Meant to be overridden.
"""
if self.filter_fn:
return self.filter_fn(datum)
        # this method can also be overridden entirely
return datum
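# A minimal usage sketch: keep only comment lines from an in-memory source
#   (any iterable passes validate_source):
#
#     lines = ["# header\n", "chr1\t100\n", "# footer\n"]
#     comments = FilteredDataProvider(
#         lines, filter_fn=lambda d: d if d.startswith("#") else None
#     )
#     assert list(comments) == ["# header\n", "# footer\n"]
#     assert comments.num_data_read == 3
#     assert comments.num_valid_data_read == 2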
class LimitedOffsetDataProvider(FilteredDataProvider):
"""
A provider that uses the counters from FilteredDataProvider to limit the
number of data and/or skip `offset` number of data before providing.
Useful for grabbing sections from a source (e.g. pagination).
"""
# define the expected types of these __init__ arguments so they can be parsed out from query strings
settings = {"limit": "int", "offset": "int"}
# TODO: may want to squash this into DataProvider
    def __init__(self, source, offset=0, limit=MAX_LIMIT, **kwargs):
"""
:param offset: the number of data to skip before providing.
:param limit: the final number of data to provide.
"""
super().__init__(source, **kwargs)
        # how many valid data to skip before we start outputting data - must be non-negative
        self.offset = offset
        # how many valid data to return - must be non-negative (None means use MAX_LIMIT)
if limit is None:
limit = MAX_LIMIT
self.limit = limit
def __iter__(self):
"""
        Iterate over the source until `num_valid_data_read` is greater than
        `offset`, begin providing data, and stop when `num_data_returned`
        reaches `limit`.
"""
if self.limit is not None and self.limit <= 0:
return
parent_gen = super().__iter__()
for datum in parent_gen:
            # decrement to compensate for the unconditional increment in
            #   FilteredDataProvider.__iter__: only data actually yielded
            #   below should count toward num_data_returned
            self.num_data_returned -= 1
if self.num_valid_data_read > self.offset:
self.num_data_returned += 1
yield datum
if self.limit is not None and self.num_data_returned >= self.limit:
break
# TODO: skipping lines is inefficient - somehow cache file position/line_num pair and allow provider
# to seek to a pos/line and then begin providing lines
# the important catch here is that we need to have accurate pos/line pairs
# in order to preserve the functionality of limit and offset
# if file_seek and len( file_seek ) == 2:
# seek_pos, new_line_num = file_seek
# self.seek_and_set_curr_line( seek_pos, new_line_num )
# def seek_and_set_curr_line( self, file_seek, new_curr_line_num ):
# self.seek( file_seek, os.SEEK_SET )
# self.curr_line_num = new_curr_line_num
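# A minimal usage sketch: paginate an in-memory source, skipping the first
#   two valid data and returning at most two:
#
#     page = LimitedOffsetDataProvider(["a", "b", "c", "d", "e"], offset=2, limit=2)
#     assert list(page) == ["c", "d"]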
class MultiSourceDataProvider(DataProvider):
"""
A provider that iterates over a list of given sources and provides data
from one after another.
An iterator over iterators.
"""
    def __init__(self, source_list, **kwargs):
"""
:param source_list: an iterator of iterables
"""
        # NOTE: no super().__init__ call here; self.source is (re)set for
        #   each source in __iter__ below
        self.source_list = deque(source_list)
def __iter__(self):
"""
Iterate over the source_list, then iterate over the data in each source.
Skip a given source in `source_list` if it is `None` or invalid.
"""
for source in self.source_list:
# just skip falsy sources
if not source:
continue
try:
self.source = self.validate_source(source)
except exceptions.InvalidDataProviderSource:
continue
parent_gen = super().__iter__()
yield from parent_gen
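# A minimal usage sketch: concatenate data from several in-memory sources,
#   skipping entries that are falsy or fail validation:
#
#     multi = MultiSourceDataProvider([["a", "b"], None, ["c"]])
#     assert list(multi) == ["a", "b", "c"]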