hdx.scraper.base_scraper

BaseScraper Objects

class BaseScraper(ABC)

[view_source]

Base scraper class for scrapers to inherit

Arguments:

  • name str - Name of scraper
  • datasetinfo Dict - Information about dataset
  • headers Dict[str, Tuple] - Headers to be output at each level_name
  • source_configuration Dict - Configuration for sources. Defaults to empty dict (use defaults).
  • reader str - Reader to use. Defaults to "" (datasetinfo reader falling back on name).

setup

def setup(headers: Dict[str, Tuple], source_configuration: Dict = {}) -> None

[view_source]

Initialise member variables including name and headers. Headers are of the form: {"national": (("School Closure",), ("impact+type",)), ...}

Arguments:

  • headers Dict[str, Tuple] - Headers to be output at each level_name
  • source_configuration Dict - Configuration for sources. Defaults to empty dict (use defaults).

Returns:

None

initialise_values_sources

def initialise_values_sources(source_configuration: Dict = {}) -> None

[view_source]

Create values and sources member variables for inheriting scrapers to populate. values will be of form: {"national": ({"AFG": 1.2, "PSE": 1.4}, {"AFG": 123, "PSE": 241}, ...)}. sources will be of form: {"national": [("food-prices", "2022-07-15", "WFP", "https://data.humdata.org/dataset/global-wfp-food-prices"), ...]}

Arguments:

  • source_configuration Dict - Configuration for sources. Defaults to empty dict (use defaults).

Returns:

None

get_reader

def get_reader(name: Optional[str] = None)

[view_source]

Get reader given name if provided or using name member variable if not.

Arguments:

  • name Optional[str] - Name of scraper. Defaults to None (use name member variable).

Returns:

  • The reader obtained for the given name

get_headers

def get_headers(level: str) -> Optional[Tuple[Tuple]]

[view_source]

Get headers for a particular level_name like national or subnational. Will be of form: (("School Closure",), ("impact+type",))

Arguments:

  • level str - Level to get like national, subnational or single

Returns:

  • Optional[Tuple[Tuple]] - Scraper headers or None

get_values

def get_values(level: str) -> Optional[Tuple]

[view_source]

Get values for a particular level_name like national or subnational. Will be of form: ({"AFG": 1.2, "PSE": 1.4}, {"AFG": 123, "PSE": 241}, ...)

Arguments:

  • level str - Level for which to get values

Returns:

  • Optional[Tuple] - Scraper values or None

add_sources

def add_sources() -> None

[view_source]

Adds sources for a particular level_name

Returns:

None

add_hxltag_source

def add_hxltag_source(hxltag: str,
                      datasetinfo: Optional[Dict] = None,
                      key: Optional[str] = None) -> None

[view_source]

Adds source identified by HXL hashtag under a particular key.

Arguments:

  • hxltag str - HXL hashtag to use for source
  • datasetinfo Optional[Dict] - Information about dataset. Defaults to None (use self.datasetinfo).
  • key Optional[str] - Key under which to add source. Defaults to None (use scraper name).

Returns:

None

add_hxltag_sources

def add_hxltag_sources(hxltags: ListTuple[str],
                       datasetinfo: Optional[Dict] = None,
                       key: Optional[str] = None,
                       suffix_attributes: Optional[ListTuple] = None) -> None

[view_source]

Adds sources identified by HXL hashtags under a particular key.

Arguments:

  • hxltags ListTuple[str] - HXL hashtags to use for sources
  • datasetinfo Optional[Dict] - Information about dataset. Defaults to None (use self.datasetinfo).
  • key Optional[str] - Key under which to add source. Defaults to None (use scraper name).
  • suffix_attributes Optional[ListTuple] - List of suffix attributes to append to HXL hashtags e.g. iso3 codes. Defaults to None.

Returns:

None

get_sources

def get_sources(level: str) -> Optional[List[Tuple]]

[view_source]

Get sources for a particular level_name like national or subnational. Will be of form: [("food-prices", "2022-07-15", "WFP", "https://data.humdata.org/dataset/global-wfp-food-prices"), ...]

Arguments:

  • level str - Level to get like national, subnational or single

Returns:

  • Optional[List[Tuple]] - Scraper sources or None

add_source_urls

def add_source_urls() -> None

[view_source]

Add source urls from the datasetinfo member variable

Returns:

None

get_source_urls

def get_source_urls() -> Set[str]

[view_source]

Get source urls

Returns:

  • Set[str] - Source urls

get_hapi_dataset_metadata

def get_hapi_dataset_metadata() -> Optional[Dict]

[view_source]

Get HAPI dataset metadata

Returns:

  • Optional[Dict] - HAPI dataset metadata

get_hapi_resource_metadata

def get_hapi_resource_metadata() -> Optional[Dict]

[view_source]

Get HAPI resource metadata

Returns:

  • Optional[Dict] - HAPI resource metadata

add_population

def add_population() -> None

[view_source]

Add population data by looking for the population HXL hashtag among the headers and pulling out the associated values

Returns:

None

run

@abstractmethod
def run() -> None

[view_source]

Run scraper. Must be overridden.

Returns:

None

run_after_fallbacks

def run_after_fallbacks() -> None

[view_source]

Executed after fallbacks are used. Can be overridden if needed.

Returns:

None

pre_run

def pre_run() -> None

[view_source]

Executed before running. Can be overridden if needed.

Returns:

None

post_run

def post_run() -> None

[view_source]

Executed after running. Can be overridden if needed.

Returns:

None