# Source code for wpull.scraper.util

'''Misc functions.'''

import functools
import gettext
import itertools
import logging
import mimetypes
import re
import string

import wpull.url
from wpull.backport.logging import BraceMessage as __
from wpull.pipeline.item import LinkType

# Short alias for the gettext lookup, used to mark user-facing messages.
_ = gettext.gettext
# Logger named after this module.
_logger = logging.getLogger(__name__)


def parse_refresh(text):
    '''Parse the URL out of an HTTP Refresh value.

    Args:
        text (str): The refresh value, for example
            ``5; url=http://example.com/``.

    The URL is everything after the first ``url=`` (matched
    case-insensitively). Whitespace surrounding the value is ignored and,
    if the value begins with a single or double quote, that quote character
    is stripped from both ends before the link is cleaned.

    Returns:
        str, None: The cleaned URL, or ``None`` if there is no ``url=`` part.
    '''
    match = re.search(r'url\s*=(.+)', text, re.IGNORECASE)

    if not match:
        return None

    # Strip whitespace before looking for quotes so that a value written as
    # ``url = "http://example.com/"`` is still recognised as quoted.
    url = match.group(1).strip()

    if url.startswith('"'):
        url = url.strip('"')
    elif url.startswith("'"):
        url = url.strip("'")

    return clean_link_soup(url)


def clean_link_soup(link):
    '''Strip whitespace from a link in HTML soup.

    Args:
        link (str): A string containing the link with lots of whitespace.

    The link is split into lines. Each line has its leading and trailing
    whitespace removed and every tab character inside it dropped. The
    resulting pieces are joined back together with nothing in between.

    For example, passing the ``href`` value of::

        <a href=" http://example.com/

                blog/entry/

            how smaug stole all the bitcoins.html
        ">

    will return
    ``http://example.com/blog/entry/how smaug stole all the bitcoins.html``.

    Returns:
        str: The cleaned link.
    '''
    return ''.join(
        line.strip().replace('\t', '') for line in link.splitlines()
    )


def urljoin_safe(base_url, url, allow_fragments=True):
    '''urljoin with warning log on error.

    Args:
        base_url: The base URL passed to :func:`wpull.url.urljoin`.
        url: The URL to join onto the base.
        allow_fragments: Passed through to :func:`wpull.url.urljoin`.

    Returns:
        str, None: The result of :func:`wpull.url.urljoin`, or ``None`` if
        it raised :class:`ValueError` (in which case a warning is logged).
    '''
    try:
        return wpull.url.urljoin(
            base_url, url, allow_fragments=allow_fragments
        )
    except ValueError as error:
        # Best effort: report the bad URL and let the caller carry on.
        _logger.warning(__(
            _('Unable to parse URL ‘{url}’: {error}.'),
            url=url, error=error
        ))
        return None


def is_likely_inline(link):
    '''Return whether the link is likely to be inline.

    The MIME type is guessed from the link using the non-strict
    :mod:`mimetypes` tables. Image, video and audio types, as well as any
    type whose subtype is ``javascript``, count as inline.

    Returns:
        bool: ``False`` when the type cannot be guessed.
    '''
    file_type = mimetypes.guess_type(link, strict=False)[0]

    if not file_type:
        return False

    # partition() never fails to unpack, even for a type without a slash.
    top_level_type, _sep, subtype = file_type.partition('/')

    return top_level_type in ('image', 'video', 'audio') \
        or subtype == 'javascript'


# Every MIME type listed in the ``mimetypes`` tables (both maps of
# ``types_map``), plus ``text/javascript``.
_mimetypes_db = mimetypes.MimeTypes()
MIMETYPES = frozenset(
    mime_type
    for type_map in _mimetypes_db.types_map
    for mime_type in type_map.values()
) | frozenset(['text/javascript'])

# Character classes used when inspecting file extensions.
ALPHANUMERIC_CHARS = frozenset(string.ascii_letters + string.digits)
NUMERIC_CHARS = frozenset(string.digits)

# Top-level domains that are commonly mistaken for file extensions.
COMMON_TLD = frozenset('com org net int edu gov mil'.split())

# Names of HTML elements.
HTML_TAGS = frozenset('''
    a abbr acronym address
    applet area article aside audio b
    base basefont bdi bdo big blockquote
    body br button canvas caption center
    cite code col colgroup command
    datalist dd del details dfn dir
    div dl dt em embed fieldset
    figcaption figure font footer form
    frame frameset head header hgroup h1
    h2 h3 h4 h5 h6 hr html i
    iframe img input ins kbd keygen
    label legend li link map mark menu
    meta meter nav noframes noscript
    object ol optgroup option output p
    param pre progress q rp rt ruby
    s samp script section select small
    source span strike strong style sub
    summary sup table tbody td textarea
    tfoot th thead time title tr track
    tt u ul var video wbr
'''.split())

# Matches text whose first path component looks like a domain name ending
# in one of the common TLDs, followed by a slash and at least one character.
FIRST_PART_TLD_PATTERN = re.compile(
    r'[^/][a-zA-Z0-9.-]+\.({})/.'.format('|'.join(COMMON_TLD)),
    re.IGNORECASE
)


# These "likely link" functions are based on
# https://github.com/internetarchive/heritrix3/
# blob/339e6ec87a7041f49c710d1d0fb94be0ec972ee7/commons/src/
# main/java/org/archive/util/UriUtils.java


def is_likely_link(text):
    '''Return whether the text is likely to be a link.

    This function assumes that leading/trailing whitespace has already been
    removed.

    The text is compared case-insensitively. It is considered a link if it
    looks like an absolute or relative URL, or if it ends in a short
    alphanumeric file extension (not purely numeric, not a common TLD) for
    which a MIME type can be guessed.

    Returns:
        bool
    '''
    text = text.lower()

    # Check for absolute or relative URLs. A leading '/' also covers
    # protocol-relative '//' URLs.
    if text.startswith(('http://', 'https://', 'ftp://', '/', '../')) \
            or text.endswith('/'):
        return True

    # Check if it has an alphanumeric file extension and not a decimal number
    _head, dot, file_extension = text.rpartition('.')

    if not dot or not file_extension or len(file_extension) > 4:
        return False

    extension_chars = frozenset(file_extension)

    if not extension_chars <= ALPHANUMERIC_CHARS \
            or extension_chars <= NUMERIC_CHARS:
        return False

    # Something like "example.com" is a host name, not a file name.
    if file_extension in COMMON_TLD:
        return False

    return bool(mimetypes.guess_type(text, strict=False)[0])


def is_unlikely_link(text):
    '''Return whether the text is likely to cause false positives.

    This function assumes that leading/trailing whitespace has already been
    removed.

    Returns:
        bool: ``True`` if the text should be rejected as a link candidate.
    '''
    # Check for string concatenation in JavaScript.
    # For empty text both slices are '' and '' is contained in every
    # string, so empty text is reported as unlikely as well.
    if text[:1] in ',;+:' or text[-1:] in '.,;+:':
        return True

    # Check for unusual characters
    if re.search(r'''[\\$()'"[\]{}|<>`]''', text):
        return True

    # A leading dot is only acceptable as part of './' or '../'.
    if text.startswith('.') and not text.startswith(('./', '../')):
        return True

    if text in ('/', '//'):
        return True

    # A double slash that is neither a scheme separator nor the start of a
    # protocol-relative URL.
    if '//' in text and '://' not in text and not text.startswith('//'):
        return True

    # Forbid strings like mimetypes
    if text in MIMETYPES:
        return True

    # Forbid things like "div.classname" while still allowing "index.html".
    tag_1, _sep, tag_2 = text.partition('.')
    if tag_1 in HTML_TAGS and tag_2 != 'html':
        return True

    # Forbid things where the first part of the path looks like a domain name
    if FIRST_PART_TLD_PATTERN.match(text):
        return True

    return False


@functools.lru_cache()
def identify_link_type(filename):
    '''Return link type guessed by filename extension.

    Args:
        filename (str): A filename or URL whose extension is inspected.

    Returns:
        str, None: A value from :class:`.item.LinkType`, or ``None`` if the
        MIME type cannot be guessed or does not map to a link type.
    '''
    mime_type = mimetypes.guess_type(filename)[0]

    if not mime_type:
        return None

    if mime_type == 'text/css':
        return LinkType.css

    # Newer ``mimetypes`` tables report ``text/javascript`` for ``.js``
    # (the type registered by RFC 9239) instead of the older
    # ``application/javascript``; accept both.
    if mime_type in ('application/javascript', 'text/javascript'):
        return LinkType.javascript

    if mime_type == 'text/html' or mime_type.endswith('xml'):
        return LinkType.html

    if mime_type.startswith(('video', 'image', 'audio')) \
            or mime_type.endswith('shockwave-flash'):
        return LinkType.media

    return None