# Source code for wpull.document.base

'''Document bases.'''
import abc


class VeryFalseType:
    '''Type of a marker object that always evaluates as false.

    Instances are distinct objects from the built-in ``False``, so code can
    recognise them with an identity check while plain truth tests still
    treat them as false.
    '''

    def __bool__(self):
        '''Return ``False`` for every instance.'''
        return False


# Module-level sentinel instance of ``VeryFalseType``. It is falsy (its
# ``__bool__`` returns False) and ``BaseDocumentDetector.is_supported`` checks
# for it by identity (``result is VeryFalse``), which lets a detector's answer
# be told apart from an ordinary falsy value.
# NOTE(review): the description below presumably means "definitely not
# supported" -- confirm the intended wording.
VeryFalse = VeryFalseType()
'''Document is not definitely supported.'''


class BaseDocumentDetector(metaclass=abc.ABCMeta):
    '''Base class for classes that detect document types.

    Subclasses override whichever of :meth:`is_file`, :meth:`is_request`,
    :meth:`is_response` and :meth:`is_url` they are able to answer. The
    default implementations raise :exc:`NotImplementedError`, which
    :meth:`is_supported` treats as "no opinion" and skips.
    '''

    @classmethod
    def is_supported(cls, file=None, request=None, response=None,
                     url_info=None):
        '''Given the hints, return whether the document is supported.

        The hints are examined in the order response, file, request, URL.
        The first check that returns a truthy value or the ``VeryFalse``
        sentinel decides the outcome; any other falsy answer lets the next
        hint be tried.

        Args:
            file: A file object containing the document.
            request (:class:`.http.request.Request`): An HTTP request.
            response (:class:`.http.request.Response`): An HTTP response.
            url_info (:class:`.url.URLInfo`): A URLInfo.

        Returns:
            bool: ``True`` if a check reports the document as supported.
            The ``VeryFalse`` sentinel (which is falsy) is returned as-is
            if a check produced it. ``False`` is returned when no hint
            yields a verdict.
        '''
        tests = (
            (response, cls.is_response),
            (file, cls.is_file),
            (request, cls.is_request),
            (url_info, cls.is_url),
        )

        for instance, method in tests:
            if not instance:
                # Hint was not supplied (or is falsy): nothing to test.
                continue

            try:
                result = method(instance)
            except NotImplementedError:
                # The detector does not implement this optional check.
                continue

            if result:
                return True

            if result is VeryFalse:
                # Pass the sentinel through unchanged so callers can still
                # identify it with an ``is`` comparison.
                return VeryFalse

        # No hint gave a verdict. Return an explicit bool, as documented,
        # instead of implicitly returning None.
        return False

    @classmethod
    def is_file(cls, file):
        '''Return whether the reader is likely able to read the file.

        Args:
            file: A file object containing the document.

        Returns:
            bool

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        '''
        raise NotImplementedError()  # optional override

    @classmethod
    def is_request(cls, request):
        '''Return whether the request is likely supported.

        Args:
            request (:class:`.http.request.Request`): An HTTP request.

        Returns:
            bool

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        '''
        raise NotImplementedError()  # optional override

    @classmethod
    def is_response(cls, response):
        '''Return whether the response is likely able to be read.

        Args:
            response (:class:`.http.request.Response`): An HTTP response.

        Returns:
            bool

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        '''
        raise NotImplementedError()  # optional override

    @classmethod
    def is_url(cls, url_info):
        '''Return whether the URL is likely to be supported.

        Args:
            url_info (:class:`.url.URLInfo`): A URLInfo.

        Returns:
            bool

        Raises:
            NotImplementedError: Always, unless overridden by a subclass.
        '''
        raise NotImplementedError()  # optional override


class BaseTextStreamReader(metaclass=abc.ABCMeta):
    '''Base class for readers that split a document into link and other text.

    Subclasses implement :meth:`iter_text`; :meth:`iter_links` is built on
    top of it.
    '''

    @abc.abstractmethod
    def iter_text(self, file, encoding=None):
        '''Return the text of the file, flagged as link or non-link.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            iterator: Each item is a tuple:

            1. str: The text.
            2. bool (or truthy value): Whether the text is likely a link.
               A truthy value other than ``True`` may be used to carry
               additional context about the link.

        The links returned are raw text and will require further processing.
        '''

    def iter_links(self, file, encoding=None, context=False):
        '''Return the links.

        This is a convenience wrapper that calls :meth:`iter_text` and keeps
        only the items whose second element is truthy.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.
            context (bool): If true, each result is the whole item produced
                by :meth:`iter_text`; otherwise only the item's text.

        Returns:
            list: The link texts, or the complete items when ``context`` is
            true.
        '''
        keep_whole_item = bool(context)
        links = []

        for entry in self.iter_text(file, encoding):
            if not entry[1]:
                continue

            links.append(entry if keep_whole_item else entry[0])

        return links


class BaseExtractiveReader(metaclass=abc.ABCMeta):
    '''Base class for document readers that can only extract links.'''

    def iter_links(self, file, encoding=None):
        '''Return links from file.

        This base implementation does nothing and returns ``None``; the
        contract below describes what an overriding reader provides.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            iterator: Each item is a str which represents a link.
        '''
        return None


class BaseHTMLReader(metaclass=abc.ABCMeta):
    '''Base class for document readers for handling SGML-like documents.'''

    @abc.abstractmethod
    def iter_elements(self, file, encoding=None):
        '''Return an iterator over the elements found in the document.

        Concrete readers must override this method.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            iterator: Each item is an element from
            :mod:`.document.htmlparse.element`
        '''