# Source code for wpull.database.base

'''Base table class.'''
import abc

import typing
from typing import Iterator, Optional

from wpull.pipeline.item import URLRecord, URLProperties, URLData, Status, \
    URLResult


class DatabaseError(Exception):
    '''Any database error.

    Root of this module's exception hierarchy; :class:`NotFound` derives
    from it.
    '''
class NotFound(DatabaseError):
    '''Item not found in the table.

    Documented as raised by :meth:`BaseURLTable.get_one` and
    :meth:`BaseURLTable.check_out`; :meth:`BaseURLTable.contains` catches
    it to report absence.
    '''
# Record describing one URL to insert via ``BaseURLTable.add_many``.
# Note the generated class is named ``_AddURLInfo`` while the module-level
# binding is ``AddURLInfo``.
AddURLInfo = typing.NamedTuple(
    '_AddURLInfo',
    [
        ('url', str),
        ('properties', URLProperties),
        ('data', URLData),
    ],
)
class BaseURLTable(object, metaclass=abc.ABCMeta):
    '''URL table.

    Abstract interface for a table of URL records.  Subclasses must
    implement every method marked abstract; :meth:`contains`,
    :meth:`add_one` and :meth:`remove_one` are concrete conveniences built
    on top of the abstract methods.
    '''

    @abc.abstractmethod
    def count(self) -> int:
        '''Return the number of URLs in the table.

        This call may be expensive.
        '''

    @abc.abstractmethod
    def get_one(self, url: str) -> URLRecord:
        '''Return a URLRecord for the URL.

        Raises:
            NotFound
        '''

    def contains(self, url: str) -> bool:
        '''Return whether the URL is in the table.

        Implemented by calling :meth:`get_one` and treating
        :class:`NotFound` as absence.
        '''
        try:
            self.get_one(url)
        except NotFound:
            return False
        return True

    @abc.abstractmethod
    def get_all(self) -> Iterator[URLRecord]:
        '''Return all URLRecord.'''

    @abc.abstractmethod
    def add_many(self, new_urls: Iterator[AddURLInfo]) -> Iterator[str]:
        '''Add the URLs to the table.

        Args:
            new_urls: URLs to be added.

        Returns:
            The URLs added. Useful for tracking duplicates.
        '''

    def add_one(self, url: str,
                url_properties: Optional[URLProperties]=None,
                url_data: Optional[URLData]=None):
        '''Add a single URL to the table.

        Args:
            url: The URL to be added
            url_properties: Additional values to be saved
            url_data: Additional data to be saved
        '''
        # Delegates to add_many with a one-element list; the value returned
        # by add_many is intentionally not propagated to the caller.
        self.add_many([AddURLInfo(url, url_properties, url_data)])

    @abc.abstractmethod
    def check_out(self, filter_status: Status,
                  filter_level: Optional[int]=None) -> URLRecord:
        '''Find a URL, mark it in progress, and return it.

        Args:
            filter_status: Gets first item with given status.
            filter_level: Gets item with `filter_level` or lower.

        Raises:
            NotFound
        '''

    @abc.abstractmethod
    def check_in(self, url: str, new_status: Status,
                 increment_try_count: bool=True,
                 url_result: Optional[URLResult]=None):
        '''Update record for processed URL.

        Args:
            url: The URL.
            new_status: Update the item status to `new_status`.
            increment_try_count: Whether to increment the try counter
                for the URL.
            url_result: Additional values.
        '''

    @abc.abstractmethod
    def update_one(self, url, **kwargs):
        '''Arbitrarily update values for a URL.'''

    @abc.abstractmethod
    def release(self):
        '''Mark any ``in_progress`` URLs to ``todo`` status.'''

    @abc.abstractmethod
    def remove_many(self, urls):
        '''Remove the URLs from the database.'''

    def remove_one(self, url):
        '''Remove a URL from the database.

        Delegates to :meth:`remove_many` with a one-element list.
        '''
        self.remove_many([url])

    @abc.abstractmethod
    def close(self):
        '''Run any clean-up actions and close the table.'''

    @abc.abstractmethod
    def add_visits(self, visits):
        '''Add visited URLs from CDX file.

        Args:
            visits (iterable): An iterable of items. Each item is a tuple
                containing a URL, the WARC ID, and the payload digest.
        '''

    @abc.abstractmethod
    def get_revisit_id(self, url, payload_digest):
        '''Return the WARC ID corresponding to the visit.

        Returns:
            str, None
        '''

    @abc.abstractmethod
    def get_hostnames(self):
        '''Return list of hostnames.'''

    @abc.abstractmethod
    def get_root_url_todo_count(self) -> int:
        '''Return an integer count.

        NOTE(review): undocumented in the original; judging by the name,
        presumably the number of root URLs still in ``todo`` status --
        confirm against implementations.
        '''

    @abc.abstractmethod
    def convert_check_out(self) -> typing.Tuple[int, URLRecord]:
        '''Return a pair of an integer and a URLRecord.

        NOTE(review): undocumented in the original; the integer is
        presumably the ``file_id`` later passed to
        :meth:`convert_check_in` -- confirm against implementations.
        '''

    @abc.abstractmethod
    def convert_check_in(self, file_id: int, status: Status):
        '''Record ``status`` for the item identified by ``file_id``.

        NOTE(review): undocumented in the original; presumably the
        counterpart of :meth:`convert_check_out` -- confirm against
        implementations.
        '''