Updated scripts.

Signed-off-by: Viktor Govako <viktor.govako@gmail.com>
Author: Viktor Govako
Date: 2022-05-22 09:31:39 +03:00
Committed by: zyphlar
Parent: 0d08a53224
Commit: 2bfcf0089b
2 changed files with 46 additions and 58 deletions


@@ -12,58 +12,52 @@ from descriptions.descriptions_downloader import log
 def parse_args():
-    parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument(
-        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+    parser = argparse.ArgumentParser(description="Download wiki pages.", usage="python3 -m descriptions "
+                                     "--output_dir ~/maps_build/descriptions "
+                                     "--wikipedia ~/maps_build/wiki_urls.txt "
+                                     "--wikidata ~/maps_build/id_to_wikidata.csv "
+                                     "--langs en de fr es ru tr"
     )
     parser.add_argument(
-        "--popularity",
-        metavar="PATH",
-        type=str,
-        help="File with popular object ids for which we "
-        "download wikipedia data. If not given, download "
-        "for all objects.",
+        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages."
     )
     parser.add_argument(
-        "--wikipedia",
-        metavar="PATH",
-        type=str,
-        required=True,
-        help="Input file with wikipedia url.",
+        "--popularity", metavar="PATH", type=str,
+        help="File with popular object ids with wikipedia data to download. If not given, download all objects.",
+    )
+    parser.add_argument(
+        "--wikipedia", metavar="PATH", type=str, required=True, help="Input file with wikipedia url.",
     )
     parser.add_argument(
         "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
     )
-    parser.add_argument(
-        "--langs",
-        metavar="LANGS",
-        type=str,
-        nargs="+",
-        action="append",
-        help="Languages for pages. If left blank, pages in all "
-        "available languages will be loaded.",
+    parser.add_argument("--langs", metavar="LANGS", type=str, nargs="+", action="append",
+        help="Languages for pages. If left blank, pages in all available languages will be loaded.",
     )
     return parser.parse_args()


 def main():
     log.setLevel(logging.WARNING)
-    wikipediaapi.log.setLevel(logging.WARNING)
+    wikipediaapi.log.setLevel(logging.DEBUG)
     args = parse_args()
     wikipedia_file = args.wikipedia
     wikidata_file = args.wikidata
     output_dir = args.output_dir
     popularity_file = args.popularity
     langs = list(itertools.chain.from_iterable(args.langs))
     os.makedirs(output_dir, exist_ok=True)
     checker = check_and_get_checker(popularity_file)
     download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
     if wikidata_file is None:
         log.warning(f"Wikidata file not set.")
     elif os.path.exists(wikidata_file):
         download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
     else:
-        log.warning(f"Wikidata ({wikidata_file}) file not set.")
+        log.warning(f"Wikidata ({wikidata_file}) file not found.")


 main()
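A side note on the --langs definition kept by this hunk: argparse with nargs="+" combined with action="append" yields a list of lists, which is why main() flattens args.langs with itertools.chain.from_iterable. A minimal standalone sketch of that behavior (not part of the commit; the sample values are illustrative):

    import argparse
    import itertools

    parser = argparse.ArgumentParser()
    parser.add_argument("--langs", metavar="LANGS", type=str, nargs="+", action="append")

    # Each --langs occurrence appends one list of values.
    args = parser.parse_args(["--langs", "en", "de", "--langs", "fr"])
    assert args.langs == [["en", "de"], ["fr"]]

    # main() therefore flattens the nested lists before use.
    langs = list(itertools.chain.from_iterable(args.langs))
    assert langs == ["en", "de", "fr"]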


@@ -1,4 +1,3 @@
-import functools
 import json
 import logging
 import os
@@ -9,7 +8,7 @@ import urllib.error
 import urllib.parse
 import http.client

-from multiprocessing.pool import ThreadPool
+from concurrent.futures import ThreadPoolExecutor

 import htmlmin
 import requests
@@ -17,8 +16,7 @@ import wikipediaapi
 from bs4 import BeautifulSoup
 from wikidata.client import Client

-from descriptions.exceptions import GettingError
-from descriptions.exceptions import ParseError
+from descriptions.exceptions import GettingError, ParseError

 """
 This script downloads Wikipedia pages for different languages.
@@ -26,9 +24,8 @@ This script downloads Wikipedia pages for different languages.
 log = logging.getLogger(__name__)

 WORKERS = 80
-CHUNK_SIZE = 16
-REQUEST_ATTEMPTS = 32
-ATTEMPTS_PAUSE_MS = 4000
+REQUEST_ATTEMPTS = 8
+ATTEMPTS_PAUSE_SECONDS = 4.0

 HEADERS = {f"h{x}" for x in range(1, 7)}
 BAD_SECTIONS = {
@@ -77,9 +74,8 @@ def try_get(obj, prop, *args, **kwargs):
             requests.exceptions.ReadTimeout,
             json.decoder.JSONDecodeError,
             http.client.HTTPException,
-        ):
-            time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
-            attempts -= 1
+        ) as e:
+            log.debug(e)
         except urllib.error.HTTPError as e:
             if e.code == 404:
                 raise GettingError(f"Page not found {e.msg}")
@@ -88,9 +84,10 @@ def try_get(obj, prop, *args, **kwargs):
         except urllib.error.URLError:
             raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")

-    raise GettingError(
-        f"Getting {prop} field failed. " f"All {REQUEST_ATTEMPTS} attempts are spent"
-    )
+        time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
+        attempts -= 1
+
+    raise GettingError(f"Getting {prop} field failed")


 def read_popularity(path):
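For reference, the retry flow that try_get() ends up with after these two hunks can be sketched standalone roughly as below. The names retry and fetch_fn, the generic Exception handler, and RuntimeError are illustrative placeholders; the real function retries getattr(obj, prop)(*args, **kwargs), catches the specific network/JSON exceptions listed above, and raises GettingError.

    import random
    import time

    REQUEST_ATTEMPTS = 8
    ATTEMPTS_PAUSE_SECONDS = 4.0

    def retry(fetch_fn, *args, **kwargs):
        # Retry transient failures, sleeping a random 0..4 s between attempts,
        # and give up with an error once the attempt budget is spent.
        attempts = REQUEST_ATTEMPTS
        while attempts != 0:
            try:
                return fetch_fn(*args, **kwargs)
            except Exception as e:  # illustrative; try_get() lists concrete exceptions
                print(e)            # try_get() uses log.debug(e)
            time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
            attempts -= 1
        raise RuntimeError("all attempts are spent")  # try_get() raises GettingError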
@@ -109,7 +106,6 @@ def read_popularity(path):
 def should_download_page(popularity_set):
-    @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
@@ -177,18 +173,21 @@ def download(directory, url):
     try:
         lang, page_name = get_page_info(url)
     except ParseError:
-        log.exception("Parsing failed. {url} is incorrect.")
+        log.exception(f"Parsing failed. {url} is incorrect.")
         return None
     path = os.path.join(directory, f"{lang}.html")
     if os.path.exists(path):
-        log.warning(f"{path} already exists.")
+        log.debug(f"{path} already exists.")
         return None
     page = get_wiki_page(lang, page_name)
     try:
         text = try_get(page, "text")
-    except GettingError:
-        log.exception(f"Error: page is not downloaded {page_name}.")
+    except GettingError as e:
+        log.exception(f"Error: page {page_name} is not downloaded for lang {lang} and url {url} ({e}).")
         return None
     page_size = len(text)
     if page_size > 0:
         os.makedirs(directory, exist_ok=True)
@@ -198,15 +197,15 @@ def download(directory, url):
             file.write(text)
     else:
         log.warning(f"Page {url} is empty. It has not been saved.")
     return text


 def get_wiki_langs(url):
     lang, page_name = get_page_info(url)
     page = get_wiki_page(lang, page_name)
-    curr_lang = [
-        (lang, url),
-    ]
+    curr_lang = [(lang, url)]
     try:
         langlinks = try_get(page, "langlinks")
         return (
@@ -214,7 +213,7 @@ def get_wiki_langs(url):
             + curr_lang
         )
     except GettingError as e:
-        log.warning(f"Error: no languages for {url} ({e}).")
+        log.exception(f"Error: no languages for page {page_name} with url {url} ({e}).")
         return curr_lang
@@ -230,12 +229,12 @@ def download_all_from_wikipedia(path, url, langs):
 def wikipedia_worker(output_dir, checker, langs):
-    @functools.wraps(wikipedia_worker)
     def wrapped(line):
         if not line.strip():
             return

         try:
-            mwm_path, ident, url = line.split("\t")
+            # First param is mwm_path, which added this line entry.
+            _, ident, url = line.split("\t")
             ident = int(ident)
             if not checker(ident):
                 return
@@ -252,11 +251,9 @@ def wikipedia_worker(output_dir, checker, langs):
 def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
     with open(input_file) as file:
-        _ = file.readline()
-        pool = ThreadPool(processes=WORKERS)
-        pool.map(wikipedia_worker(output_dir, checker, langs), file, CHUNK_SIZE)
-        pool.close()
-        pool.join()
+        _ = file.readline()  # skip header
+        with ThreadPoolExecutor(WORKERS) as pool:
+            pool.map(wikipedia_worker(output_dir, checker, langs), file)


 def get_wikidata_urls(entity, langs):
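The switch from multiprocessing.pool.ThreadPool to concurrent.futures.ThreadPoolExecutor keeps the same map-over-lines shape. A minimal standalone sketch of the new pattern (the input path "wiki_urls.txt" and process_line are placeholders, not from the commit):

    from concurrent.futures import ThreadPoolExecutor

    WORKERS = 80

    def process_line(line):
        # Placeholder for the real worker returned by wikipedia_worker(...).
        print(line.strip())

    with open("wiki_urls.txt") as file:  # placeholder input path
        _ = file.readline()              # skip the header line, as in the script
        with ThreadPoolExecutor(WORKERS) as pool:
            # map() submits one task per remaining line; leaving the with block
            # calls shutdown(wait=True), so it waits for all tasks to finish.
            # Worker exceptions surface only if the returned iterator is consumed.
            pool.map(process_line, file)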
@@ -273,7 +270,6 @@ def get_wikidata_urls(entity, langs):
 def wikidata_worker(output_dir, checker, langs):
-    @functools.wraps(wikidata_worker)
     def wrapped(line):
         if not line.strip():
             return
@@ -306,10 +302,8 @@ def download_from_wikidata_tags(input_file, output_dir, langs, checker):
     wikidata_output_dir = os.path.join(output_dir, "wikidata")
     os.makedirs(wikidata_output_dir, exist_ok=True)
     with open(input_file) as file:
-        with ThreadPool(processes=WORKERS) as pool:
-            pool.map(
-                wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE
-            )
+        with ThreadPoolExecutor(WORKERS) as pool:
+            pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file)


 def check_and_get_checker(popularity_file):