# pdf2book/texdict/texdict.py

#!/usr/bin/env python3

import argparse
import json
import os
import sys
# Command-line interface: one positional input file plus optional switches
# for the input format, the output format and verbose diagnostics.
parser = argparse.ArgumentParser(
    prog='texdict',
    description='convert LaTeX glossary to json',
    epilog='onipa',
)

parser.add_argument('file', help='input file')
parser.add_argument(
    '-i', '--input',
    choices=('json', 'tex'),
    # When -i is omitted the script derives the input format from the file
    # name, so the help text must not advertise "json" as the default.
    help='input format: json, tex (default: taken from the file extension; sqlite might be added)',
)
parser.add_argument(
    '-o', '--output',
    choices=('json', 'tex'),
    help='output format: json, tex (default: json; sqlite might be added)',
)
parser.add_argument(
    '-v', '--verbose',
    action='store_true',
    help='boolean verbose switch (default: false)',
)
# Planned options, not implemented yet. Note that the file-output option
# cannot reuse -o/--output, which already selects the output format.
#parser.add_argument('-e', '--entry', help='add an entry to a file')
#parser.add_argument('-o', '--output', help='write dictionary file to file')
#parser.add_argument('-w', '--write', action='store_true', required=False, help='overwrite input file')

args = parser.parse_args()
# verboseprint behaves like print() with -v/--verbose and is a silent no-op
# otherwise (pattern from https://stackoverflow.com/a/5980173).
verboseprint = print if args.verbose else lambda *a, **k: None
inputfile = args.file
# An explicit -i/--input wins. Otherwise the format is the file extension;
# os.path.splitext only looks at the last path component, so a dot in a
# directory name is not mistaken for an extension, and lower-casing accepts
# names such as "GLOSSARY.TEX".
informat = args.input or os.path.splitext(inputfile)[1].lstrip('.').lower()
if informat not in ('json', 'tex'):
    # Fail early with a usage message instead of crashing later on an
    # unbound variable when no reader matches the format.
    parser.error(
        f"cannot determine the input format of {inputfile!r}; "
        "use -i/--input json|tex"
    )
verboseprint("INFO: input file format:", informat)
# The output format defaults to JSON.
outformat = args.output or 'json'
verboseprint("INFO: output file format:", outformat)

# Known word types: long name -> abbreviation written into the TeX glossary.
shorttypes = dict(noun='n')
# Reverse table (abbreviation -> long name), used when reading TeX back in.
shorttypes_inversed = dict(zip(shorttypes.values(), shorttypes.keys()))
verboseprint("INFO: Word types:", shorttypes)
#
# READ JSON FILE
def get_data_from_jsonfile(filename):
    """Load a JSON glossary file and return it re-serialised as a JSON string.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        str: The parsed document dumped with ``ensure_ascii=False``, so
        non-ASCII characters are kept as they are instead of being escaped.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    verboseprint("INFO: opening file:", filename)
    # JSON text is UTF-8; do not depend on the platform's locale encoding.
    with open(filename, encoding='utf-8') as infile:
        data = json.load(infile)
    # Serialise once and reuse the result for both the log line and the return.
    serialized = json.dumps(data, ensure_ascii=False)
    verboseprint("INFO:", serialized)
    return serialized

def find_matching_parens(s, braces=None):
    """Return the index pairs of all matching braces in *s*.

    Args:
        s: Text to scan.
        braces: Optional mapping of opening character to closing character.
            Defaults to curly braces only (``{"{": "}"}``).

    Returns:
        list: ``[open_index, close_index]`` pairs, ordered by the position of
        the closing brace, so an inner pair precedes the pair enclosing it.

    Raises:
        ValueError: If a closing brace has no opener, closes an opener of a
            different kind, or if an opener is never closed.
    """
    openers = braces or {"{": "}"}
    closers = {closer: opener for opener, closer in openers.items()}
    stack = []
    pairs = []

    for pos, char in enumerate(s):
        if char in openers:
            stack.append((char, pos))
        elif char in closers:
            if not stack:
                raise ValueError(f"tried to close brace without an open at position {pos}")
            opener, opened_at = stack.pop()
            # Validate before recording, so a mismatched pair is never stored.
            if opener != closers[char]:
                raise ValueError(f"mismatched brace at position {pos}")
            pairs.append([opened_at, pos])

    if stack:
        # Report where the innermost unclosed brace was opened; the end of
        # the string is not the position the user needs to look at.
        _, opened_at = stack[-1]
        raise ValueError(
            f"no closing brace for the brace opened at position {opened_at}"
        )
    return pairs

# READ TEX FILE
def _parse_tex_meaning(text):
    r"""Parse the text following one ``\m`` marker into a meaning dict.

    The expected shape is ``(abbr) description ; example, example``.

    Args:
        text: Raw text of a single meaning, without the ``\m`` marker.

    Returns:
        dict: ``{"type": ..., "description": ..., "examples": [...]}``.
    """
    text = text.strip()
    verboseprint("INFO: Meaning:", text)

    # Word type: an optional leading "(abbr)". Without one the type is "".
    abbreviation, remainder = '', text
    if text.startswith('(') and ')' in text:
        abbreviation, _, remainder = text[1:].partition(')')
        abbreviation = abbreviation.strip()

    # The description runs up to the first ';'; the rest is the example list.
    description, _, examples_raw = remainder.partition(';')
    # Examples are comma separated. json2tex writes '; ' in front of every
    # example, so ';' is accepted as a separator too; otherwise all but the
    # first example would be lost when its output is read back.
    examples = [
        example.strip()
        for example in examples_raw.replace(';', ',').split(',')
        if example.strip()
    ]
    if examples:
        verboseprint("INFO: examples:", examples)

    return {
        # An abbreviation missing from the table is kept verbatim, mirroring
        # json2tex, which writes unknown long type names out unchanged.
        'type': shorttypes_inversed.get(abbreviation, abbreviation),
        'description': description.strip(),
        'examples': examples,
    }


def get_data_from_texfile(filename):
    r"""Read a TeX glossary file and return its content as a JSON string.

    Every non-blank line that is not a ``%`` comment must hold one complete
    entry of the form ``\command{entry}{key}{\m (abbr) description ; examples}``.
    The command name itself is not checked. The entry and the key are the
    first two brace groups, taken by plain splitting, so nested braces inside
    them are not supported.

    Args:
        filename: Path of the TeX file to read.

    Returns:
        str: JSON text of the form ``{"entries": [{"entry": ..., "key": ...,
        "meanings": [{"type": ..., "description": ..., "examples": [...]}]}]}``,
        serialised with ``ensure_ascii=False``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If an entry line lacks the entry and key brace groups.
    """
    # TODO: planned rewrite (previously sketched here as commented-out code):
    #   * treat only lines starting with \dictentrysorted as entries and join
    #     continuation lines until the braces balance;
    #   * take entry, key and meanings from the brace groups found by
    #     find_matching_parens instead of plain splitting;
    #   * open question: examples are split at ', ', which breaks example
    #     sentences that contain a comma themselves.
    verboseprint("INFO: opening file:", filename)
    entries = []
    with open(filename, encoding='utf-8') as infile:
        for lineno, line in enumerate(infile, start=1):
            stripped = line.strip()
            # Skip blank lines and comment lines, indented ones included.
            if not stripped or stripped.startswith('%'):
                continue

            groups = stripped.split('{')
            if len(groups) < 3:
                raise ValueError(
                    f"{filename}:{lineno}: cannot find the entry and key "
                    f"groups in {stripped!r}"
                )
            entry = groups[1].split('}')[0]
            key = groups[2].split('}')[0]

            # Drop the brace closing the meanings group. Working on the
            # stripped line makes this independent of a trailing newline.
            body = stripped[:-1] if stripped.endswith('}') else stripped
            # Everything before the first \m marker is the entry header.
            chunks = body.split(r'\m')[1:]
            verboseprint("INFO: Meanings found:", len(chunks))

            entries.append({
                'entry': entry,
                'key': key,
                'meanings': [_parse_tex_meaning(chunk) for chunk in chunks],
            })
    # Let the json module do the quoting: TeX text is full of backslashes and
    # may contain double quotes, which hand-built JSON would leave unescaped.
    return json.dumps({'entries': entries}, ensure_ascii=False)

def json2tex(entries):
    r"""Render a JSON glossary as TeX, one ``\dictentrysorted`` line per entry.

    Each entry becomes ``\dictentrysorted{entry}{key}{...}`` where the last
    group holds ``\m (type) description `` for every meaning, followed by
    ``; example `` for every example of that meaning.

    Args:
        entries: JSON text of an object with an ``"entries"`` list. Every
            entry needs ``"entry"``, ``"key"`` and ``"meanings"``; every
            meaning needs ``"type"`` and ``"description"``, while
            ``"examples"`` may be missing or empty.

    Returns:
        str: The TeX lines, each terminated by a newline.

    Raises:
        json.JSONDecodeError: If *entries* is not valid JSON.
        KeyError: If a required key is missing.
    """
    lines = []
    for entry_json in json.loads(entries)["entries"]:
        # Raw strings: '\d' and '\m' are not valid escape sequences.
        parts = [r'\dictentrysorted{', entry_json['entry'], '}{', entry_json['key'], '}{']
        for meaning in entry_json['meanings']:
            #TYPE
            wordtype = meaning['type']
            if wordtype in shorttypes:
                wordtype = shorttypes[wordtype]
            else:
                # Warn on stderr so the message cannot end up inside the
                # TeX that the caller prints to stdout. The unknown type
                # name is written out unchanged.
                print("VERBOSE: stderr: "+wordtype+" is not in the the known word type list!", file=sys.stderr)
            parts.append(r'\m (' + wordtype + ') ')
            #DESCRIPTION
            parts.append(meaning['description'] + ' ')
            #EXAMPLES
            verboseprint("INFO:", meaning)
            # A meaning without an "examples" key is treated as having none.
            for example in meaning.get('examples') or ():
                parts.append('; ' + example + ' ')
        parts.append('}')
        lines.append(''.join(parts) + "\n")
    return ''.join(lines)

#def tex2json(entry):
#    entry_json=json.dumps(json.loads(entry), ensure_ascii=False)
#    return entry_json

if __name__ == "__main__":
    # Script entry point: read the input in its detected format, then print
    # it to stdout in the requested output format.
    if informat == "json":
        data = get_data_from_jsonfile(inputfile)
    elif informat == 'tex':
        data = get_data_from_texfile(inputfile)
    else:
        # Without this branch an unrecognised format left `data` unbound and
        # the script died with a NameError further down.
        parser.error(
            f"unsupported input format {informat!r}; use -i/--input json|tex"
        )

    if outformat == 'json':
        print(data)
    elif outformat == 'tex':
        print(json2tex(data))

    # Raise SystemExit directly: the `exit` helper is installed by the site
    # module for interactive use and is not meant for programs.
    raise SystemExit(0)