pdf2book/texdict/texdict.py

259 lines
9.3 KiB
Python
Executable File

#!/usr/bin/env python3
import argparse
import json
import sys
# Command-line interface: one positional input file plus optional format
# overrides and a verbosity switch.
# Planned but not yet enabled: -e/--entry (add an entry to a file) and
# -w/--write (overwrite the input file).
parser = argparse.ArgumentParser(
    prog='texdict',
    description='convert LaTeX glossary to json',
    epilog='onipa',
)
parser.add_argument('file', help='input file')
parser.add_argument(
    '-i', '--input',
    choices=('json', 'tex'),
    help='input format: json, tex (default: json; sqlite might be added)',
)
parser.add_argument(
    '-o', '--output',
    choices=('json', 'tex'),
    help='output format: json, tex (default: json; sqlite might be added)',
)
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='boolean verbose switch (default: false)',
)
args = parser.parse_args()

# verboseprint idiom from https://stackoverflow.com/a/5980173:
# behaves like print() with --verbose and is a silent no-op otherwise.
if args.verbose:
    verboseprint = print
else:
    def verboseprint(*_args, **_kwargs):
        return None

inputfile = args.file

# An explicit --input wins; otherwise the text after the last '.' of the
# file name (the whole name when it contains no '.') is used as the format.
informat = args.input or inputfile.rpartition('.')[2]
verboseprint("INFO: input file format:", informat)

# The output format falls back to JSON when --output is not given.
outformat = args.output or 'json'
verboseprint("INFO: output file format:", outformat)

# KNOWN WORD TYPES
# Long word-type name (as stored in JSON) -> abbreviation (as written in TeX).
shorttypes = {
    'noun': 'n',
}
# Reverse lookup: abbreviation -> long word-type name.
shorttypes_inversed = dict(zip(shorttypes.values(), shorttypes.keys()))
verboseprint("INFO: Word types:", shorttypes)
#
# READ JSON FILE
def get_data_from_jsonfile(filename):
    """Load a JSON glossary file and return it re-serialised as a JSON string.

    The file content is parsed and dumped again, so the returned text is
    compact JSON with non-ASCII characters kept as-is (``ensure_ascii=False``).

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The file's data as a JSON-formatted ``str``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    verboseprint("INFO: opening file:", filename)
    # JSON text is UTF-8; do not depend on the platform's locale encoding,
    # which would garble non-ASCII glossary entries on some systems.
    with open(filename, encoding="utf-8") as infile:
        data = json.load(infile)
    # Serialise once and reuse the result for both logging and the return.
    serialized = json.dumps(data, ensure_ascii=False)
    verboseprint("INFO:", serialized)
    return serialized
def find_matching_parens(s, braces=None):
    """Return the index pairs of all matching braces in *s*.

    Args:
        s: Text to scan.
        braces: Optional mapping of opening character to closing character.
            Defaults to ``{"{": "}"}``.

    Returns:
        A list of ``[open_index, close_index]`` pairs in the order in which
        the closing braces occur, so an inner pair precedes the pair that
        encloses it.

    Raises:
        ValueError: If a closing brace has no open brace before it, if it
            does not match the kind of the innermost open brace, or if an
            open brace is never closed.
    """
    openers = braces or {"{": "}"}
    closers = {closer: opener for opener, closer in openers.items()}
    # (opening character, index) of every brace still waiting for its closer.
    stack = []
    result = []
    for i, c in enumerate(s):
        if c in openers:
            stack.append((c, i))
        elif c in closers:
            if not stack:
                raise ValueError(f"tried to close brace without an open at position {i}")
            opener, idx = stack.pop()
            if opener != closers[c]:
                raise ValueError(f"mismatched brace at position {i}")
            result.append([idx, i])
    if stack:
        # Report where the unclosed brace was opened; the index of the last
        # scanned character says nothing about which brace is at fault.
        _, idx = stack[-1]
        raise ValueError(f"no closing brace for the brace opened at position {idx}")
    return result
# READ TEX FILE
def _parse_tex_meaning(meaning):
    """Convert the text of one ``\\m`` item into a meaning dictionary.

    Expected layout::

        (<short type>) <description>[; <example>[, <example>...]]...

    Everything after the closing parenthesis of the type is split on ``;``:
    the first piece is the description, every further piece holds one or
    more comma-separated examples. Surrounding whitespace is removed from
    the description and from each example, and empty examples are dropped.

    Args:
        meaning: Text of a single meaning without the leading ``\\m`` and
            without the brace that closes the entry.

    Returns:
        A dict with the keys ``"type"``, ``"description"`` and ``"examples"``.

    Raises:
        IndexError: If the text has no parenthesised word type.
    """
    # TODO: a meaning without a "(type)" prefix is still rejected here.
    after_open = meaning.split("(", 1)[1]
    type_and_body = after_open.split(")", 1)
    short_wordtype = type_and_body[0]
    body = type_and_body[1]
    # json2tex writes word types it does not know verbatim, so accept them
    # verbatim here as well instead of failing with KeyError.
    wordtype = shorttypes_inversed.get(short_wordtype, short_wordtype)

    description, *example_groups = body.split(";")
    examples = [
        example.strip()
        for group in example_groups
        for example in group.split(",")
        if example.strip()
    ]
    if examples:
        verboseprint("INFO: examples:", examples)
    return {
        "type": wordtype,
        "description": description.strip(),
        "examples": examples,
    }


def get_data_from_texfile(filename):
    """Read a LaTeX glossary file and return its entries as a JSON string.

    Every non-blank line that is not a ``%`` comment is treated as one entry
    of the form ``\\command{entry}{key}{\\m (type) description ; examples}``
    (the layout json2tex writes). The first two brace groups become
    ``"entry"`` and ``"key"``; each ``\\m`` starts a new meaning.

    The result is produced with ``json.dumps`` so quotes, backslashes and
    other special characters in the TeX source are escaped correctly.

    Args:
        filename: Path of the TeX file to read.

    Returns:
        A JSON-formatted ``str`` of the shape
        ``{"entries": [{"entry", "key", "meanings": [...]}, ...]}``.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a line lacks the two leading brace groups or a
            meaning lacks its parenthesised word type.
    """
    # TODO: replace the split-based parsing below with brace matching via
    # find_matching_parens, so that entries may span several lines and the
    # entry/key fields may contain nested braces.
    # NOTE: splitting on the two characters "\m" also splits at any other
    # macro whose name starts with "m" (e.g. "\mathbf").
    verboseprint("INFO: opening file:", filename)
    entries = []
    with open(filename, encoding="utf-8") as infile:
        for line in infile:
            stripped = line.strip()
            # Skip blank lines and comment lines.
            if not stripped or stripped.startswith('%'):
                continue
            fields = stripped.split('{')
            entry = fields[1].split('}')[0]
            key = fields[2].split('}')[0]

            chunks = stripped.split('\\m')[1:]
            verboseprint("INFO: Meanings found:", len(chunks))
            meanings = []
            for i, chunk in enumerate(chunks):
                verboseprint("INFO: number of meanins", i)
                meaning = chunk.strip()
                # The last meaning is followed by the brace that closes the
                # whole entry; remove exactly that one brace.
                if i == len(chunks) - 1 and meaning.endswith('}'):
                    meaning = meaning[:-1]
                verboseprint("INFO: Meaning:", meaning)
                meanings.append(_parse_tex_meaning(meaning))

            entries.append({"entry": entry, "key": key, "meanings": meanings})
    return json.dumps({"entries": entries}, ensure_ascii=False)
def json2tex(entries):
    """Render a JSON glossary string as LaTeX ``\\dictentrysorted`` lines.

    Each entry becomes one line of the form
    ``\\dictentrysorted{entry}{key}{\\m (type) description ; example ...}``.
    Word types found in ``shorttypes`` are abbreviated; unknown types are
    written unchanged and a warning is sent to stderr so it cannot end up
    inside the generated TeX.

    Args:
        entries: JSON text with a top-level ``"entries"`` list; every entry
            has ``"entry"``, ``"key"`` and ``"meanings"``, every meaning has
            ``"type"`` and ``"description"`` and optionally ``"examples"``.

    Returns:
        The TeX source as a ``str``, one newline-terminated line per entry
        (an empty string when there are no entries).

    Raises:
        json.JSONDecodeError: If *entries* is not valid JSON.
        KeyError: If a required key is missing.
    """
    lines = []
    for entry in json.loads(entries)["entries"]:
        parts = ['\\dictentrysorted{', entry['entry'], '}{', entry['key'], '}{']
        for meaning in entry['meanings']:
            # TYPE
            wordtype = meaning['type']
            if wordtype in shorttypes:
                wordtype = shorttypes[wordtype]
            else:
                print(
                    "VERBOSE: stderr: " + wordtype
                    + " is not in the the known word type list!",
                    file=sys.stderr,
                )
            parts.append('\\m (' + wordtype + ') ')
            # DESCRIPTION
            parts.append(meaning['description'] + ' ')
            # EXAMPLES (a missing or empty list simply adds nothing)
            verboseprint("INFO:", meaning)
            parts.extend(
                '; ' + example + ' '
                for example in meaning.get('examples') or []
            )
        parts.append('}\n')
        lines.append(''.join(parts))
    return ''.join(lines)
#def tex2json(entry):
# entry_json=json.dumps(json.loads(entry), ensure_ascii=False)
# return entry_json
if __name__ == "__main__":
    # Read the input into a JSON string, whatever the source format is.
    if informat == "json":
        data = get_data_from_jsonfile(inputfile)
    elif informat == "tex":
        data = get_data_from_texfile(inputfile)
    else:
        # The format was guessed from the file name and is not supported;
        # without this branch `data` would be unbound further down.
        parser.error(
            f"unsupported input format {informat!r}; pass -i json or -i tex"
        )

    # Write the result to stdout in the requested format.
    if outformat == "json":
        print(data)
    elif outformat == "tex":
        print(json2tex(data))

    # Explicit success status; SystemExit works even when the `site`
    # module's interactive `exit` helper is unavailable.
    raise SystemExit(0)