'''Document bases.'''
import abc
class VeryFalseType(object):
    '''Type whose instances always evaluate as false in a boolean context.

    The module-level instance of this type is used as a sentinel that is
    falsy yet still distinguishable, by identity, from ``False`` and ``None``.
    '''

    def __bool__(self):
        '''Return ``False`` unconditionally.'''
        return False
# Module-level sentinel instance of VeryFalseType. It is falsy, and
# BaseDocumentDetector.is_supported compares detector results against it by
# identity (``is``) and returns it unchanged when it matches.
# NOTE(review): the attribute string below presumably means "definitely not
# supported" -- confirm the intended wording.
VeryFalse = VeryFalseType()
'''Document is not definitely supported.'''
class BaseDocumentDetector(object, metaclass=abc.ABCMeta):
    '''Base class for classes that detect document types.

    Subclasses override any of :meth:`is_file`, :meth:`is_request`,
    :meth:`is_response` and :meth:`is_url`. Checks that are not overridden
    raise :class:`NotImplementedError` and are skipped by
    :meth:`is_supported`.
    '''

    @classmethod
    def is_supported(cls, file=None, request=None, response=None,
                     url_info=None):
        '''Given the hints, return whether the document is supported.

        Args:
            file: A file object containing the document.
            request (:class:`.http.request.Request`): An HTTP request.
            response (:class:`.http.request.Response`): An HTTP response.
            url_info (:class:`.url.URLInfo`): A URLInfo.

        Returns:
            bool: If True, the reader should be able to read it.
            ``VeryFalse`` is returned as soon as a check yields that
            sentinel. If no check gives either answer, ``None`` (falsy) is
            returned.
        '''
        # The hints are consulted in this order; the first check that
        # returns a truthy value or the VeryFalse sentinel decides.
        tests = (
            (response, cls.is_response),
            (file, cls.is_file),
            (request, cls.is_request),
            (url_info, cls.is_url),
        )

        for instance, method in tests:
            # Hints that were not supplied (or are falsy) are not checked.
            if not instance:
                continue

            try:
                result = method(instance)
            except NotImplementedError:
                # The subclass did not override this optional check.
                continue

            if result:
                return True
            if result is VeryFalse:
                return VeryFalse
            # Any other falsy result is inconclusive: try the next hint.

        # No check accepted or definitively rejected the document.
        return None

    @classmethod
    def is_file(cls, file):
        '''Return whether the reader is likely able to read the file.

        Args:
            file: A file object containing the document.

        Returns:
            bool

        Raises:
            NotImplementedError: If the subclass does not override this
                check.
        '''
        raise NotImplementedError()  # optional override

    @classmethod
    def is_request(cls, request):
        '''Return whether the request is likely supported.

        Args:
            request (:class:`.http.request.Request`): An HTTP request.

        Returns:
            bool

        Raises:
            NotImplementedError: If the subclass does not override this
                check.
        '''
        raise NotImplementedError()  # optional override

    @classmethod
    def is_response(cls, response):
        '''Return whether the response is likely able to be read.

        Args:
            response (:class:`.http.request.Response`): An HTTP response.

        Returns:
            bool

        Raises:
            NotImplementedError: If the subclass does not override this
                check.
        '''
        raise NotImplementedError()  # optional override

    @classmethod
    def is_url(cls, url_info):
        '''Return whether the URL is likely to be supported.

        Args:
            url_info (:class:`.url.URLInfo`): A URLInfo.

        Returns:
            bool

        Raises:
            NotImplementedError: If the subclass does not override this
                check.
        '''
        raise NotImplementedError()  # optional override
class BaseTextStreamReader(object, metaclass=abc.ABCMeta):
    '''Base class for document readers that filters link and non-link text.'''

    @abc.abstractmethod
    def iter_text(self, file, encoding=None):
        '''Return the file text and links.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            iterator: Each item is a tuple:

            1. str: The text
            2. bool (or truthy value): Whether the text is likely a link.
               A truthy value may be provided containing additional
               context of the link.

        The links returned are raw text and will require further processing.
        '''

    def iter_links(self, file, encoding=None, context=False):
        '''Return the links.

        This function is a convenience function for calling :meth:`iter_text`
        and returning only the links.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.
            context (bool): If true, each returned item is the whole tuple
                produced by :meth:`iter_text`; otherwise only the text
                (the first element) is returned.

        Returns:
            list: The items of :meth:`iter_text` whose second element is
            truthy.
        '''
        # Keep only the items flagged as links (second element truthy).
        link_items = [
            item for item in self.iter_text(file, encoding) if item[1]
        ]

        if context:
            return link_items

        return [item[0] for item in link_items]
class BaseHTMLReader(object, metaclass=abc.ABCMeta):
    '''Base class for document readers for handling SGML-like documents.'''

    @abc.abstractmethod
    def iter_elements(self, file, encoding=None):
        '''Return an iterator of elements found in the document.

        Args:
            file: A file object containing the document.
            encoding (str): The encoding of the document.

        Returns:
            iterator: Each item is an element from
            :mod:`.document.htmlparse.element`
        '''