'''HTML document readers.'''
import io
from wpull.document.base import BaseHTMLReader, BaseDocumentDetector
import wpull.string
class HTMLLightParserTarget(object):
    '''An HTML parser target for partial elements.

    Only elements whose tag is listed in ``text_elements`` have their text
    collected; every other element is reported immediately from
    :meth:`start` with ``None`` as its text.

    Args:
        callback: A callback function. The function should accept the
            arguments:

                1. `tag` (str): The tag name of the element.
                2. `attrib` (dict): The attributes of the element.
                3. `text` (str, None): The text of the element.

        text_elements: A frozenset of element tag names that we should keep
            track of text.
    '''
    def __init__(self, callback,
                 text_elements=frozenset(
                     ['style', 'script', 'link', 'url', 'icon'])):
        self.callback = callback
        self.text_elements = text_elements
        self.tag = None
        self.attrib = None
        # ``None`` while no text element is open; otherwise an
        # ``io.StringIO`` accumulating that element's text.
        self.buffer = None

    def _flush(self):
        '''Report the buffered text element, if any, and clear the buffer.

        Clearing after every emission guarantees an element is reported at
        most once, regardless of how ``end`` and ``close`` are interleaved.
        '''
        if self.buffer is not None:
            self.callback(self.tag, self.attrib, self.buffer.getvalue())
            self.buffer = None

    def start(self, tag, attrib):
        '''Handle an opening tag.

        Elements that are not text elements are reported right away without
        text. A text element opens a fresh buffer; a text element that was
        still pending is reported first.
        '''
        if tag not in self.text_elements:
            self.callback(tag, attrib, None)
            return

        self._flush()

        self.tag = tag
        self.attrib = attrib
        self.buffer = io.StringIO()

    def data(self, data):
        '''Append character data to the open text element, if there is one.'''
        if self.buffer is not None:
            self.buffer.write(data)

    def end(self, tag):
        '''Handle a closing tag by reporting the pending text element.

        The ``tag`` argument is not consulted; whichever text element is
        currently buffered is the one reported.
        '''
        self._flush()

    def close(self):
        '''Report any text element still pending and return ``True``.

        The buffer is cleared afterwards, so calling ``close`` again (or
        ``end`` after ``close``) does not report the same element twice.
        '''
        self._flush()
        return True
# Module-level sentinel: a bare ``object()`` instance, so it only ever
# compares equal (and identical) to itself.
# NOTE(review): presumably used as the "tag" value for HTML comment nodes by
# parser implementations; it is not referenced by the code visible in this
# module -- confirm against the parsers that import it.
COMMENT = object()
'''Comment element'''
class HTMLParserTarget(object):
    '''An HTML parser target.

    Text inside an element and text following an element's end tag (its
    "tail") are accumulated separately and handed to the callback when the
    next parser event arrives.

    Args:
        callback: A callback function. The function should accept the
            arguments:

                1. `tag` (str): The tag name of the element.
                2. `attrib` (dict): The attributes of the element.
                3. `text` (str, None): The text of the element.
                4. `tail` (str, None): The text after the element.
                5. `end` (bool): Whether the tag is an end tag.

            Note that element-text reports are made with only the first
            four positional arguments, so the callback needs a default
            for `end`.
    '''
    def __init__(self, callback):
        self.callback = callback
        self.tag = None
        self.attrib = None
        # Each buffer is either None or an io.StringIO collecting text.
        self.buffer = None
        self.tail_buffer = None

    def _emit_text(self, tag):
        '''Report buffered element text under ``tag`` and drop the buffer.'''
        pending = self.buffer
        if not pending:
            return
        self.callback(tag, self.attrib, pending.getvalue(), None)
        self.buffer = None

    def _emit_tail(self):
        '''Report buffered tail text as an end-tag event and drop the buffer.'''
        pending = self.tail_buffer
        if not pending:
            return
        self.callback(self.tag, None, None, pending.getvalue(), True)
        self.tail_buffer = None

    def start(self, tag, attrib):
        '''Handle an opening tag: flush pending text, then buffer anew.'''
        self._emit_text(self.tag)
        self._emit_tail()

        self.tag, self.attrib = tag, attrib
        self.buffer = io.StringIO()

    def data(self, data):
        '''Append character data to whichever buffers are currently open.'''
        for sink in (self.buffer, self.tail_buffer):
            if sink:
                sink.write(data)

    def end(self, tag):
        '''Handle a closing tag and begin collecting its tail text.

        Pending element text is reported under the ``tag`` given here
        rather than the remembered start tag.
        '''
        self._emit_text(tag)
        self._emit_tail()

        self.tail_buffer = io.StringIO()
        self.tag = tag

    def close(self):
        '''Flush whatever is still buffered and return ``True``.'''
        self._emit_text(self.tag)
        self._emit_tail()
        return True
class HTMLReadElement(object):
    '''Plain record describing one element encountered while reading HTML.

    Attributes:
        tag (str): The element tag name.
        attrib (dict): The element attributes.
        text (str, None): The element text.
        tail (str, None): The text after the element.
        end (bool): Whether the tag is an end tag.
    '''
    __slots__ = ('tag', 'attrib', 'text', 'tail', 'end')

    def __init__(self, tag, attrib, text, tail, end):
        self.tag, self.attrib = tag, attrib
        self.text, self.tail = text, tail
        self.end = end

    def __repr__(self):
        # Render every field through repr(), in constructor-argument order.
        values = (self.tag, self.attrib, self.text, self.tail, self.end)
        template = 'HTMLReadElement({0}, {1}, {2}, {3}, {4})'
        return template.format(*map(repr, values))
class HTMLReader(BaseDocumentDetector, BaseHTMLReader):
    '''HTML document reader.

    The ``is_*`` class methods are heuristics: each returns ``True`` on a
    positive match and otherwise falls through to ``None`` (falsy) rather
    than an explicit ``False``.

    Arguments:
        html_parser (:class:`.document.htmlparse.BaseParser`): An HTML parser.
    '''
    def __init__(self, html_parser):
        self._html_parser = html_parser

    @classmethod
    def is_response(cls, response):
        '''Return whether the Response is likely to be HTML.

        The ``content-type`` field is checked first; when it does not
        mention ``html`` and the response has a body, the body is sniffed
        with :meth:`is_file`.
        '''
        content_type = response.fields.get('content-type', '')

        if 'html' in content_type.lower():
            return True

        if not response.body:
            return None

        return cls.is_file(response.body)

    @classmethod
    def is_request(cls, request):
        '''Return whether the Request is likely to be a HTML.

        Decided purely from the request's ``url_info`` via :meth:`is_url`.
        '''
        return cls.is_url(request.url_info)

    @classmethod
    def is_url(cls, url_info):
        '''Return whether the URLInfo is likely to be a HTML.

        Matches when the lower-cased path contains one of a few
        HTML-style extension fragments anywhere in it.
        '''
        lowered_path = url_info.path.lower()
        fragments = ('.htm', '.dhtm', '.xht')

        if any(fragment in lowered_path for fragment in fragments):
            return True

        return None

    @classmethod
    def is_file(cls, file):
        '''Return whether the file is likely to be HTML.

        Peeks at the file and looks for common HTML markers in the
        lower-cased data.
        '''
        # NOTE(review): ``wpull.util`` is not among the imports visible at
        # the top of this module; this presumably relies on another module
        # having imported it -- confirm.
        sample = wpull.string.printable_bytes(
            wpull.util.peek_file(file)).lower()
        markers = (
            b'<!doctype html',
            b'<head',
            b'<title',
            b'<html',
            b'<script',
            b'<table',
            b'<a href',
        )

        if any(marker in sample for marker in markers):
            return True

        return None

    def iter_elements(self, file, encoding=None):
        '''Parse ``file`` with the configured HTML parser.

        Returns whatever the parser's ``parse(file, encoding)`` returns.
        '''
        return self._html_parser.parse(file, encoding)