diff --git a/tools/python/descriptions/__main__.py b/tools/python/descriptions/__main__.py
index f74a1fac7..d0a125d35 100644
--- a/tools/python/descriptions/__main__.py
+++ b/tools/python/descriptions/__main__.py
@@ -12,58 +12,52 @@ from descriptions.descriptions_downloader import log
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument(
-        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+    parser = argparse.ArgumentParser(description="Download wiki pages.", usage="python3 -m descriptions "
+                                     "--output_dir ~/maps_build/descriptions "
+                                     "--wikipedia ~/maps_build/wiki_urls.txt "
+                                     "--wikidata ~/maps_build/id_to_wikidata.csv "
+                                     "--langs en de fr es ru tr"
     )
     parser.add_argument(
-        "--popularity",
-        metavar="PATH",
-        type=str,
-        help="File with popular object ids for which we "
-        "download wikipedia data. If not given, download "
-        "for all objects.",
+        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages."
     )
     parser.add_argument(
-        "--wikipedia",
-        metavar="PATH",
-        type=str,
-        required=True,
-        help="Input file with wikipedia url.",
+        "--popularity", metavar="PATH", type=str,
+        help="File with popular object ids with wikipedia data to download. If not given, download all objects.",
+    )
+    parser.add_argument(
+        "--wikipedia", metavar="PATH", type=str, required=True, help="Input file with wikipedia url.",
     )
     parser.add_argument(
         "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
     )
-    parser.add_argument(
-        "--langs",
-        metavar="LANGS",
-        type=str,
-        nargs="+",
-        action="append",
-        help="Languages for pages. If left blank, pages in all "
-        "available languages will be loaded.",
+    parser.add_argument("--langs", metavar="LANGS", type=str, nargs="+", action="append",
+                        help="Languages for pages. If left blank, pages in all available languages will be loaded.",
     )
     return parser.parse_args()
 
 
 def main():
     log.setLevel(logging.WARNING)
-    wikipediaapi.log.setLevel(logging.WARNING)
+    wikipediaapi.log.setLevel(logging.DEBUG)
+
     args = parse_args()
     wikipedia_file = args.wikipedia
     wikidata_file = args.wikidata
     output_dir = args.output_dir
     popularity_file = args.popularity
     langs = list(itertools.chain.from_iterable(args.langs))
+
     os.makedirs(output_dir, exist_ok=True)
     checker = check_and_get_checker(popularity_file)
     download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
+
     if wikidata_file is None:
         log.warning(f"Wikidata file not set.")
     elif os.path.exists(wikidata_file):
         download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
     else:
-        log.warning(f"Wikidata ({wikidata_file}) file not set.")
+        log.warning(f"Wikidata ({wikidata_file}) file not found.")
 
 
 main()
diff --git a/tools/python/descriptions/descriptions_downloader.py b/tools/python/descriptions/descriptions_downloader.py
index 996389426..957511e94 100644
--- a/tools/python/descriptions/descriptions_downloader.py
+++ b/tools/python/descriptions/descriptions_downloader.py
@@ -1,4 +1,3 @@
-import functools
 import json
 import logging
 import os
@@ -9,7 +8,7 @@ import urllib.error
 import urllib.parse
 import http.client
 
-from multiprocessing.pool import ThreadPool
+from concurrent.futures import ThreadPoolExecutor
 
 import htmlmin
 import requests
@@ -17,8 +16,7 @@ import wikipediaapi
 from bs4 import BeautifulSoup
 from wikidata.client import Client
 
-from descriptions.exceptions import GettingError
-from descriptions.exceptions import ParseError
+from descriptions.exceptions import GettingError, ParseError
 
 """
 This script downloads Wikipedia pages for different languages.
@@ -26,9 +24,8 @@ This script downloads Wikipedia pages for different languages.
 log = logging.getLogger(__name__)
 
 WORKERS = 80
-CHUNK_SIZE = 16
-REQUEST_ATTEMPTS = 32
-ATTEMPTS_PAUSE_MS = 4000
+REQUEST_ATTEMPTS = 8
+ATTEMPTS_PAUSE_SECONDS = 4.0
 
 HEADERS = {f"h{x}" for x in range(1, 7)}
 BAD_SECTIONS = {
@@ -77,9 +74,8 @@ def try_get(obj, prop, *args, **kwargs):
             requests.exceptions.ReadTimeout,
             json.decoder.JSONDecodeError,
             http.client.HTTPException,
-        ):
-            time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
-            attempts -= 1
+        ) as e:
+            log.debug(e)
         except urllib.error.HTTPError as e:
             if e.code == 404:
                 raise GettingError(f"Page not found {e.msg}")
@@ -88,9 +84,10 @@ def try_get(obj, prop, *args, **kwargs):
         except urllib.error.URLError:
             raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")
 
-    raise GettingError(
-        f"Getting {prop} field failed. " f"All {REQUEST_ATTEMPTS} attempts are spent"
-    )
+        time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
+        attempts -= 1
+
+    raise GettingError(f"Getting {prop} field failed")
 
 
 def read_popularity(path):
@@ -109,7 +106,6 @@ def read_popularity(path):
 
 
 def should_download_page(popularity_set):
-    @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
 
@@ -177,18 +173,21 @@ def download(directory, url):
     try:
         lang, page_name = get_page_info(url)
     except ParseError:
-        log.exception("Parsing failed. {url} is incorrect.")
+        log.exception(f"Parsing failed. {url} is incorrect.")
         return None
+
     path = os.path.join(directory, f"{lang}.html")
     if os.path.exists(path):
-        log.warning(f"{path} already exists.")
+        log.debug(f"{path} already exists.")
         return None
+
     page = get_wiki_page(lang, page_name)
     try:
         text = try_get(page, "text")
-    except GettingError:
-        log.exception(f"Error: page is not downloaded {page_name}.")
+    except GettingError as e:
+        log.exception(f"Error: page {page_name} is not downloaded for lang {lang} and url {url} ({e}).")
         return None
+
     page_size = len(text)
     if page_size > 0:
         os.makedirs(directory, exist_ok=True)
@@ -198,15 +197,15 @@ def download(directory, url):
             file.write(text)
     else:
         log.warning(f"Page {url} is empty. It has not been saved.")
+
     return text
 
 
 def get_wiki_langs(url):
     lang, page_name = get_page_info(url)
     page = get_wiki_page(lang, page_name)
-    curr_lang = [
-        (lang, url),
-    ]
+
+    curr_lang = [(lang, url)]
     try:
         langlinks = try_get(page, "langlinks")
         return (
@@ -214,7 +213,7 @@ def get_wiki_langs(url):
             + curr_lang
         )
     except GettingError as e:
-        log.warning(f"Error: no languages for {url} ({e}).")
+        log.exception(f"Error: no languages for page {page_name} with url {url} ({e}).")
         return curr_lang
 
 
@@ -230,12 +229,12 @@ def download_all_from_wikipedia(path, url, langs):
 
 
 def wikipedia_worker(output_dir, checker, langs):
-    @functools.wraps(wikipedia_worker)
     def wrapped(line):
         if not line.strip():
             return
         try:
-            mwm_path, ident, url = line.split("\t")
+            # First param is mwm_path, which added this line entry.
+            _, ident, url = line.split("\t")
             ident = int(ident)
             if not checker(ident):
                 return
@@ -252,11 +251,9 @@ def wikipedia_worker(output_dir, checker, langs):
 
 def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
     with open(input_file) as file:
-        _ = file.readline()
-        pool = ThreadPool(processes=WORKERS)
-        pool.map(wikipedia_worker(output_dir, checker, langs), file, CHUNK_SIZE)
-        pool.close()
-        pool.join()
+        _ = file.readline()  # skip header
+        with ThreadPoolExecutor(WORKERS) as pool:
+            pool.map(wikipedia_worker(output_dir, checker, langs), file)
 
 
 def get_wikidata_urls(entity, langs):
@@ -273,7 +270,6 @@ def get_wikidata_urls(entity, langs):
 
 
 def wikidata_worker(output_dir, checker, langs):
-    @functools.wraps(wikidata_worker)
     def wrapped(line):
         if not line.strip():
             return
@@ -306,10 +302,8 @@ def download_from_wikidata_tags(input_file, output_dir, langs, checker):
     wikidata_output_dir = os.path.join(output_dir, "wikidata")
     os.makedirs(wikidata_output_dir, exist_ok=True)
     with open(input_file) as file:
-        with ThreadPool(processes=WORKERS) as pool:
-            pool.map(
-                wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE
-            )
+        with ThreadPoolExecutor(WORKERS) as pool:
+            pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file)
 
 
 def check_and_get_checker(popularity_file):