Source code for wpull.scraper.util

'''Misc functions.'''

import functools
import gettext
import itertools
import logging
import mimetypes
import re
import string

import wpull.url
from wpull.backport.logging import BraceMessage as __
from wpull.pipeline.item import LinkType

_ = gettext.gettext
_logger = logging.getLogger(__name__)


def parse_refresh(text):
    '''Parse text for an HTTP Refresh URL.

    Returns:
        str, None
    '''
    match = re.search(r'url\s*=(.+)', text, re.IGNORECASE)

    if match:
        url = match.group(1)

        if url.startswith('"'):
            url = url.strip('"')
        elif url.startswith("'"):
            url = url.strip("'")

        return clean_link_soup(url)
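# Illustrative usage sketch (not part of the original module; the example
# values are assumptions). Given a Refresh value such as
# "0; url='http://example.com/next'", parse_refresh() extracts only the URL
# portion, strips surrounding quotes, and passes it through
# clean_link_soup(); it returns None when no 'url=' component is present:
#
#     parse_refresh("0; url='http://example.com/next'")
#     # -> 'http://example.com/next'
#     parse_refresh('10')
#     # -> None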
def urljoin_safe(base_url, url, allow_fragments=True):
    '''urljoin with a warning logged on error.

    Returns:
        str, None
    '''
    try:
        return wpull.url.urljoin(
            base_url, url, allow_fragments=allow_fragments
        )
    except ValueError as error:
        _logger.warning(__(
            _('Unable to parse URL ‘{url}’: {error}.'),
            url=url, error=error
        ))
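# Illustrative usage sketch (not part of the original module; the example
# URLs are assumptions). On success this is plain URL joining via
# wpull.url.urljoin(); if that call raises ValueError, the error is logged
# as a warning and None is returned instead of propagating the exception:
#
#     urljoin_safe('http://example.com/a/', 'b.html')
#     # -> 'http://example.com/a/b.html'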
def is_likely_inline(link):
    '''Return whether the link is likely to be inline.'''
    file_type = mimetypes.guess_type(link, strict=False)[0]

    if file_type:
        top_level_type, subtype = file_type.split('/', 1)

        return top_level_type in ('image', 'video', 'audio') or \
            subtype == 'javascript'
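# Illustrative usage sketch (not part of the original module; the example
# URLs are assumptions). The check is based purely on the MIME type guessed
# from the link's filename extension:
#
#     is_likely_inline('http://example.com/photo.jpg')
#     # -> True (image/jpeg)
#     is_likely_inline('http://example.com/script.js')
#     # -> True (subtype is 'javascript')
#     is_likely_inline('http://example.com/page.html')
#     # -> False (text/html)
#     is_likely_inline('http://example.com/page')
#     # -> None (no extension, so no MIME type is guessed)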
_mimetypes_db = mimetypes.MimeTypes()
MIMETYPES = frozenset(
    itertools.chain(
        _mimetypes_db.types_map[0].values(),
        _mimetypes_db.types_map[1].values(),
        ['text/javascript']
    )
)
ALPHANUMERIC_CHARS = frozenset(string.ascii_letters + string.digits)
NUMERIC_CHARS = frozenset(string.digits)
COMMON_TLD = frozenset(['com', 'org', 'net', 'int', 'edu', 'gov', 'mil'])

HTML_TAGS = frozenset([
    "a", "abbr", "acronym", "address", "applet", "area", "article",
    "aside", "audio", "b", "base", "basefont", "bdi", "bdo", "big",
    "blockquote", "body", "br", "button", "canvas", "caption", "center",
    "cite", "code", "col", "colgroup", "command", "datalist", "dd",
    "del", "details", "dfn", "dir", "div", "dl", "dt", "em", "embed",
    "fieldset", "figcaption", "figure", "font", "footer", "form",
    "frame", "frameset", "head", "header", "hgroup", "h1", "h2", "h3",
    "h4", "h5", "h6", "hr", "html", "i", "iframe", "img", "input",
    "ins", "kbd", "keygen", "label", "legend", "li", "link", "map",
    "mark", "menu", "meta", "meter", "nav", "noframes", "noscript",
    "object", "ol", "optgroup", "option", "output", "p", "param",
    "pre", "progress", "q", "rp", "rt", "ruby", "s", "samp", "script",
    "section", "select", "small", "source", "span", "strike", "strong",
    "style", "sub", "summary", "sup", "table", "tbody", "td",
    "textarea", "tfoot", "th", "thead", "time", "title", "tr", "track",
    "tt", "u", "ul", "var", "video", "wbr"
])

FIRST_PART_TLD_PATTERN = re.compile(
    r'[^/][a-zA-Z0-9.-]+\.({})/.'.format('|'.join(COMMON_TLD)),
    re.IGNORECASE
)

# These "likely link" functions are based on
# https://github.com/internetarchive/heritrix3/
# blob/339e6ec87a7041f49c710d1d0fb94be0ec972ee7/commons/src/
# main/java/org/archive/util/UriUtils.java


@functools.lru_cache()