hdx.scraper.base_scraper
BaseScraper Objects
class BaseScraper(ABC)
Base scraper class for scrapers to inherit
Arguments:
name
str - Name of scraper
datasetinfo
Dict - Information about dataset
headers
Dict[str, Tuple] - Headers to be output at each level_name
source_configuration
Dict - Configuration for sources. Defaults to empty dict (use defaults).
reader
str - Reader to use. Defaults to "" (datasetinfo reader falling back on name).
setup
def setup(headers: Dict[str, Tuple], source_configuration: Dict = {}) -> None
Initialise member variables including name and headers which is of form:
{"national": (("School Closure",), ("#impact+type",)), ...},
Arguments:
headers
Dict[str, Tuple] - Headers to be output at each level_name
source_configuration
Dict - Configuration for sources. Defaults to empty dict (use defaults).
Returns:
None
initialise_values_sources
def initialise_values_sources(source_configuration: Dict = {}) -> None
Create values and sources member variables for inheriting scrapers to populate.
values will be of form:
{"national": ({"AFG": 1.2, "PSE": 1.4}, {"AFG": 123, "PSE": 241}, ...)}
sources will be of form:
{"national": [("#food-prices", "2022-07-15", "WFP", "https://data.humdata.org/dataset/global-wfp-food-prices"), ...]}
Arguments:
source_configuration
Dict - Configuration for sources. Defaults to empty dict (use defaults).
Returns:
None
get_reader
def get_reader(name: Optional[str] = None)
Get reader given name if provided or using name member variable if not.
Arguments:
name
Optional[str] - Name of reader to get. Defaults to None (use name member variable).
Returns:
The reader for the given name
get_headers
def get_headers(level: str) -> Optional[Tuple[Tuple]]
Get headers for a particular level_name like national or subnational. Will be
of form: (("School Closure",), ("#impact+type",))
Arguments:
level
str - Level to get like national, subnational or single
Returns:
Optional[Tuple[Tuple]]
- Scraper headers or None
get_values
def get_values(level: str) -> Optional[Tuple]
Get values for a particular level_name like national or subnational. Will be of form: ({"AFG": 1.2, "PSE": 1.4}, {"AFG": 123, "PSE": 241}, ...)
Arguments:
level
str - Level for which to get values
Returns:
Optional[Tuple]
- Scraper values or None
add_sources
def add_sources() -> None
Adds sources for a particular level_name
Returns:
None
add_hxltag_source
def add_hxltag_source(hxltag: str,
datasetinfo: Optional[Dict] = None,
key: Optional[str] = None) -> None
Adds source identified by HXL hashtag under a particular key.
Arguments:
hxltag
str - HXL hashtag to use for source
datasetinfo
Optional[Dict] - Information about dataset. Defaults to None (use self.datasetinfo).
key
Optional[str] - Key under which to add source. Defaults to None (use scraper name).
Returns:
None
add_hxltag_sources
def add_hxltag_sources(hxltags: ListTuple[str],
datasetinfo: Optional[Dict] = None,
key: Optional[str] = None,
suffix_attributes: Optional[ListTuple] = None) -> None
Adds sources identified by HXL hashtags under a particular key.
Arguments:
hxltags
ListTuple[str] - HXL hashtags to use for sources
datasetinfo
Optional[Dict] - Information about dataset. Defaults to None (use self.datasetinfo).
key
Optional[str] - Key under which to add source. Defaults to None (use scraper name).
suffix_attributes
Optional[ListTuple] - List of suffix attributes to append to HXL hashtags, e.g. iso3 codes. Defaults to None.
Returns:
None
get_sources
def get_sources(level: str) -> Optional[List[Tuple]]
Get sources for a particular level_name like national or subnational. Will be of
form:
[("#food-prices", "2022-07-15", "WFP", "https://data.humdata.org/dataset/global-wfp-food-prices"), ...]
Arguments:
level
str - Level to get like national, subnational or single
Returns:
Optional[List[Tuple]]
- Scraper sources or None
add_source_urls
def add_source_urls() -> None
Add source urls from the datasetinfo member variable
Returns:
None
get_source_urls
def get_source_urls() -> Set[str]
Get source urls
Returns:
Set[str]
- Source urls
get_hapi_dataset_metadata
def get_hapi_dataset_metadata() -> Optional[Dict]
Get HAPI dataset metadata
Returns:
Optional[Dict]
- HAPI dataset metadata
get_hapi_resource_metadata
def get_hapi_resource_metadata() -> Optional[Dict]
Get HAPI resource metadata
Returns:
Optional[Dict]
- HAPI resource metadata
add_population
def add_population() -> None
Add population data by looking for the #population HXL hashtag among the
headers and pulling out the associated values
Returns:
None
run
@abstractmethod
def run() -> None
Run scraper. Must be overridden.
Returns:
None
run_after_fallbacks
def run_after_fallbacks() -> None
Executed after fallbacks are used. Can be overridden if needed.
Returns:
None
pre_run
def pre_run() -> None
Executed before running. Can be overridden if needed.
Returns:
None
post_run
def post_run() -> None
Executed after running. Can be overridden if needed.
Returns:
None