mirror of https://codeberg.org/comaps/comaps
Updated scripts.
Signed-off-by: Viktor Govako <viktor.govako@gmail.com>
@@ -12,58 +12,52 @@ from descriptions.descriptions_downloader import log
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Download wiki pages.")
-    parser.add_argument(
-        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages"
+    parser = argparse.ArgumentParser(description="Download wiki pages.", usage="python3 -m descriptions "
+        "--output_dir ~/maps_build/descriptions "
+        "--wikipedia ~/maps_build/wiki_urls.txt "
+        "--wikidata ~/maps_build/id_to_wikidata.csv "
+        "--langs en de fr es ru tr"
     )
     parser.add_argument(
-        "--popularity",
-        metavar="PATH",
-        type=str,
-        help="File with popular object ids for which we "
-        "download wikipedia data. If not given, download "
-        "for all objects.",
+        "--output_dir", metavar="PATH", type=str, help="Output dir for saving pages."
     )
     parser.add_argument(
-        "--wikipedia",
-        metavar="PATH",
-        type=str,
-        required=True,
-        help="Input file with wikipedia url.",
+        "--popularity", metavar="PATH", type=str,
+        help="File with popular object ids with wikipedia data to download. If not given, download all objects.",
+    )
+    parser.add_argument(
+        "--wikipedia", metavar="PATH", type=str, required=True, help="Input file with wikipedia url.",
     )
     parser.add_argument(
         "--wikidata", metavar="PATH", type=str, help="Input file with wikidata ids."
     )
-    parser.add_argument(
-        "--langs",
-        metavar="LANGS",
-        type=str,
-        nargs="+",
-        action="append",
-        help="Languages for pages. If left blank, pages in all "
-        "available languages will be loaded.",
+    parser.add_argument("--langs", metavar="LANGS", type=str, nargs="+", action="append",
+        help="Languages for pages. If left blank, pages in all available languages will be loaded.",
     )
     return parser.parse_args()
 
 
 def main():
     log.setLevel(logging.WARNING)
-    wikipediaapi.log.setLevel(logging.WARNING)
+    wikipediaapi.log.setLevel(logging.DEBUG)
 
     args = parse_args()
     wikipedia_file = args.wikipedia
     wikidata_file = args.wikidata
     output_dir = args.output_dir
     popularity_file = args.popularity
     langs = list(itertools.chain.from_iterable(args.langs))
 
     os.makedirs(output_dir, exist_ok=True)
     checker = check_and_get_checker(popularity_file)
     download_from_wikipedia_tags(wikipedia_file, output_dir, langs, checker)
 
     if wikidata_file is None:
         log.warning(f"Wikidata file not set.")
     elif os.path.exists(wikidata_file):
         download_from_wikidata_tags(wikidata_file, output_dir, langs, checker)
     else:
-        log.warning(f"Wikidata ({wikidata_file}) file not set.")
+        log.warning(f"Wikidata ({wikidata_file}) file not found.")
 
 
 main()
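The --langs option keeps nargs="+" together with action="append", so argparse produces a list of lists and main() still flattens it with itertools.chain.from_iterable. A minimal, self-contained illustration of that behavior (the argv values below are made up; this is not the project's parser):

import argparse
import itertools

parser = argparse.ArgumentParser()
parser.add_argument("--langs", metavar="LANGS", type=str, nargs="+", action="append")

args = parser.parse_args(["--langs", "en", "de", "--langs", "ru"])
print(args.langs)                                       # [['en', 'de'], ['ru']]
print(list(itertools.chain.from_iterable(args.langs)))  # ['en', 'de', 'ru']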
@@ -1,4 +1,3 @@
-import functools
 import json
 import logging
 import os
@@ -9,7 +8,7 @@ import urllib.error
 import urllib.parse
 import http.client
 
-from multiprocessing.pool import ThreadPool
+from concurrent.futures import ThreadPoolExecutor
 
 import htmlmin
 import requests
@@ -17,8 +16,7 @@ import wikipediaapi
 from bs4 import BeautifulSoup
 from wikidata.client import Client
 
-from descriptions.exceptions import GettingError
-from descriptions.exceptions import ParseError
+from descriptions.exceptions import GettingError, ParseError
 
 """
 This script downloads Wikipedia pages for different languages.
@@ -26,9 +24,8 @@ This script downloads Wikipedia pages for different languages.
 log = logging.getLogger(__name__)
 
 WORKERS = 80
-CHUNK_SIZE = 16
-REQUEST_ATTEMPTS = 32
-ATTEMPTS_PAUSE_MS = 4000
+REQUEST_ATTEMPTS = 8
+ATTEMPTS_PAUSE_SECONDS = 4.0
 
 HEADERS = {f"h{x}" for x in range(1, 7)}
 BAD_SECTIONS = {
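Taken at face value, these constants also shrink the retry budget: assuming every attempt fails on a retried exception and each pause lands at its cap, the old loop could sleep up to 32 × 4000 ms = 128 s per try_get call, while the new one tops out at 8 × 4.0 s = 32 s (the pause is drawn uniformly from [0, 4] s, so the expected total is roughly half of each figure).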
@@ -77,9 +74,8 @@ def try_get(obj, prop, *args, **kwargs):
             requests.exceptions.ReadTimeout,
             json.decoder.JSONDecodeError,
             http.client.HTTPException,
-        ):
-            time.sleep(random.uniform(0.0, 1.0 / 1000.0 * ATTEMPTS_PAUSE_MS))
-            attempts -= 1
+        ) as e:
+            log.debug(e)
         except urllib.error.HTTPError as e:
             if e.code == 404:
                 raise GettingError(f"Page not found {e.msg}")
@@ -88,9 +84,10 @@ def try_get(obj, prop, *args, **kwargs):
         except urllib.error.URLError:
             raise GettingError(f"URLError: {obj}, {prop}, {args}, {kwargs}")
 
-    raise GettingError(
-        f"Getting {prop} field failed. " f"All {REQUEST_ATTEMPTS} attempts are spent"
-    )
+        time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
+        attempts -= 1
+
+    raise GettingError(f"Getting {prop} field failed")
 
 
 def read_popularity(path):
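Pieced together from the two hunks above, the retry loop now logs the caught exception and applies one shared back-off at the bottom of the loop instead of sleeping inside the first except clause. A sketch of the resulting shape; the setup lines and the full exception tuple are not part of the diff and are assumed here:

def try_get(obj, prop, *args, **kwargs):
    attempts = REQUEST_ATTEMPTS  # assumed setup, not shown in the hunks
    while attempts != 0:
        try:
            return getattr(obj, prop)(*args, **kwargs)  # assumed call, not shown
        except (
            requests.exceptions.ReadTimeout,
            json.decoder.JSONDecodeError,
            http.client.HTTPException,
        ) as e:
            log.debug(e)
        # ... the urllib.error handlers shown above are unchanged and omitted here ...

        # One back-off shared by every retried attempt.
        time.sleep(random.uniform(0.0, ATTEMPTS_PAUSE_SECONDS))
        attempts -= 1

    raise GettingError(f"Getting {prop} field failed")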
@@ -109,7 +106,6 @@ def read_popularity(path):
 
 
 def should_download_page(popularity_set):
-    @functools.wraps(popularity_set)
     def wrapped(ident):
         return popularity_set is None or ident in popularity_set
 
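Dropping @functools.wraps(popularity_set) here (together with the functools import removed in the first hunk of this file) loses nothing: wraps expects the wrapped function as its argument, and handing it the popularity set never copied any useful metadata onto wrapped. The closure itself behaves as before; a small self-contained sketch, with made-up ids and an assumed trailing return that the hunk does not show:

def should_download_page(popularity_set):
    def wrapped(ident):
        return popularity_set is None or ident in popularity_set
    return wrapped  # assumed: the hunk ends before this line

check_all = should_download_page(None)          # no popularity file: accept every id
check_popular = should_download_page({101, 202})

print(check_all(999))      # True
print(check_popular(101))  # True
print(check_popular(999))  # False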
@@ -177,18 +173,21 @@ def download(directory, url):
     try:
         lang, page_name = get_page_info(url)
     except ParseError:
-        log.exception("Parsing failed. {url} is incorrect.")
+        log.exception(f"Parsing failed. {url} is incorrect.")
         return None
 
     path = os.path.join(directory, f"{lang}.html")
     if os.path.exists(path):
-        log.warning(f"{path} already exists.")
+        log.debug(f"{path} already exists.")
         return None
 
     page = get_wiki_page(lang, page_name)
     try:
         text = try_get(page, "text")
-    except GettingError:
-        log.exception(f"Error: page is not downloaded {page_name}.")
+    except GettingError as e:
+        log.exception(f"Error: page {page_name} is not downloaded for lang {lang} and url {url} ({e}).")
         return None
 
     page_size = len(text)
     if page_size > 0:
         os.makedirs(directory, exist_ok=True)
@@ -198,15 +197,15 @@ def download(directory, url):
             file.write(text)
     else:
         log.warning(f"Page {url} is empty. It has not been saved.")
 
     return text
 
 
 def get_wiki_langs(url):
     lang, page_name = get_page_info(url)
     page = get_wiki_page(lang, page_name)
-    curr_lang = [
-        (lang, url),
-    ]
+    curr_lang = [(lang, url)]
     try:
         langlinks = try_get(page, "langlinks")
         return (
@@ -214,7 +213,7 @@ def get_wiki_langs(url):
             + curr_lang
         )
     except GettingError as e:
-        log.warning(f"Error: no languages for {url} ({e}).")
+        log.exception(f"Error: no languages for page {page_name} with url {url} ({e}).")
         return curr_lang
 
 
@@ -230,12 +229,12 @@ def download_all_from_wikipedia(path, url, langs):
 
 
 def wikipedia_worker(output_dir, checker, langs):
-    @functools.wraps(wikipedia_worker)
     def wrapped(line):
         if not line.strip():
             return
         try:
-            mwm_path, ident, url = line.split("\t")
+            # First param is mwm_path, which added this line entry.
+            _, ident, url = line.split("\t")
             ident = int(ident)
             if not checker(ident):
                 return
@@ -252,11 +251,9 @@ def wikipedia_worker(output_dir, checker, langs):
 
 def download_from_wikipedia_tags(input_file, output_dir, langs, checker):
     with open(input_file) as file:
-        _ = file.readline()
-        pool = ThreadPool(processes=WORKERS)
-        pool.map(wikipedia_worker(output_dir, checker, langs), file, CHUNK_SIZE)
-        pool.close()
-        pool.join()
+        _ = file.readline()  # skip header
+        with ThreadPoolExecutor(WORKERS) as pool:
+            pool.map(wikipedia_worker(output_dir, checker, langs), file)
 
 
 def get_wikidata_urls(entity, langs):
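The same pool swap appears in both download paths: multiprocessing.pool.ThreadPool with explicit close()/join() becomes a concurrent.futures.ThreadPoolExecutor used as a context manager, whose exit calls shutdown(wait=True). Executor.map does accept a chunksize argument, but thread pools ignore it, so dropping CHUNK_SIZE changes nothing. A small self-contained sketch of the pattern, with a placeholder worker and input lines that are not the project's:

from concurrent.futures import ThreadPoolExecutor

WORKERS = 80  # same constant the module defines

def worker(line):
    # Stand-in for the wikipedia_worker(...)/wikidata_worker(...) closures.
    print(line.strip())

lines = ["mwm\t1\thttps://en.wikipedia.org/wiki/Example\n",
         "mwm\t2\thttps://de.wikipedia.org/wiki/Beispiel\n"]

# map() submits every item; leaving the with-block waits for completion,
# replacing the old pool.close()/pool.join() pair.
with ThreadPoolExecutor(WORKERS) as pool:
    for _ in pool.map(worker, lines):
        pass  # draining the iterator also surfaces exceptions raised in workers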
@@ -273,7 +270,6 @@ def get_wikidata_urls(entity, langs):
 
 
 def wikidata_worker(output_dir, checker, langs):
-    @functools.wraps(wikidata_worker)
     def wrapped(line):
         if not line.strip():
             return
@@ -306,10 +302,8 @@ def download_from_wikidata_tags(input_file, output_dir, langs, checker):
     wikidata_output_dir = os.path.join(output_dir, "wikidata")
     os.makedirs(wikidata_output_dir, exist_ok=True)
     with open(input_file) as file:
-        with ThreadPool(processes=WORKERS) as pool:
-            pool.map(
-                wikidata_worker(wikidata_output_dir, checker, langs), file, CHUNK_SIZE
-            )
+        with ThreadPoolExecutor(WORKERS) as pool:
+            pool.map(wikidata_worker(wikidata_output_dir, checker, langs), file)
 
 
 def check_and_get_checker(popularity_file):