Files
comaps/tools/category_consistency/omim_parsers.rb
Alexander Borsuk 76ffc99abd New cpp folder structure
Signed-off-by: Alexander Borsuk <me@alex.bio>
2025-08-14 20:52:04 +07:00

105 lines
2.5 KiB
Ruby

module OmimParsers
# To update the list, run in root directory:
# sed -nEe "s/ +([a-zA-Z]{2}(-[a-zA-Z]{2,})?) = .*$/\1/p" "data/strings/strings.txt" | sort -u | tr '\n' ' ' | sed -e 's/,$//' | fold -s -w48; echo
LANGUAGES = %w(af ar be bg ca cs da de el en en-GB es es-MX et
eu fa fi fr fr-CA he hi hu id it ja ko lt mr nb
nl pl pt pt-BR ro ru sk sv sw th tr uk vi
zh-Hans zh-Hant)
class AbstractParser
def initialize(keys)
@keys = keys
end
def parse_line(line)
raise NotImplementedError.new("You must implement parse_file.")
end
def match_category(line, result)
category_match = category.match(line)
if !category_match.nil?
category = category_match[1]
if @keys.include? category
result[category] ||= {}
end
end
end
def parse_file(filename)
current_string = nil
result = {}
File.open(filename, 'r:UTF-8').each do |line|
line.strip!
next if should_exclude_line? line
# If line is empty -> next category block started
if line.empty?
current_string = nil
next
end
current_string ||= match_category(line, result)
parsed = parse_line(line)
if !parsed.nil? and !current_string.nil?
lang, translation = parsed
current_string[lang] = translation
end
end
result
end
def category
raise NotImplementedError.new("You must implement category.")
end
def should_exclude_line?(line)
false
end
end
class CategoriesParser < AbstractParser
def parse_line(line)
line_match = /^([^:]+):(\S+)$/u.match(line)
return if !line_match
lang = $1.strip
return if !LANGUAGES.include? lang
translation = $2.strip
synonyms = []
translation.split('|').each do |token|
token_match = /\d?\^?(.*)$/.match(token)
synonyms.push(token_match[1]) if token_match
end
[lang, synonyms]
end
def should_exclude_line?(line)
line.start_with? '#'
end
def category
# We match only global categories ('food', 'bank'...)
/^@([A-Za-z0-9]+)$/
end
end
class StringsParser < AbstractParser
def parse_line(line)
line_match = /^([^=]+)=(.*)$/.match(line)
if line_match
lang = $1.strip
if LANGUAGES.include? lang
[lang, $2.strip]
end
end
end
def category
/^\[(.+)\]/
end
end
end