texdict.py: get_data_from_texfile() cleaner rewrite attempt

2025-07-21 22:24:25 +02:00 · 2025-07-21 22:24:25 +02:00 · c08442f2ea
commit c08442f2ea
parent 64de297467
2 changed files with 143 additions and 12 deletions
--- a/texdict/texdict.py
+++ b/texdict/texdict.py
@ -45,8 +45,114 @@ def get_data_from_jsonfile(filename):
        verboseprint("INFO:", json.dumps(data, ensure_ascii=False))
        return json.dumps(data, ensure_ascii=False)

+def find_matching_parens(s, braces=None):
+    openers = braces or {"{": "}"}
+    closers = {v: k for k, v in openers.items()}
+    stack = []
+    result = []
+
+    for i, c in enumerate(s):
+        if c in openers:
+            stack.append([c, i])
+        elif c in closers:
+            if not stack:
+                raise ValueError(f"tried to close brace without an open at position {i}")
+            pair, idx = stack.pop()
+            result.append([idx, i])
+
+            if pair != closers[c]:
+                raise ValueError(f"mismatched brace at position {i}")
+    if stack:
+        raise ValueError(f"no closing brace at position {i}")
+    return result
+
 # READ TEX FILE
 def get_data_from_texfile(filename):
+#    # delimiters
+#    meaning_delim='\m '
+#    type_delim=['(',') ']
+#    description_delim=', '
+#    examples_delim='; '
+#    example_delim=', ' # what if sentence has komma? maybe doch use '. ' what if example is a question???
+#    entry_command='\dictentrysorted'
+#    verboseprint("INFO: opening file:", filename)
+#    with open(filename) as infile:
+#        oldline=''
+#        i=0
+#        jsonstring='{ "entries" : [ '
+#        for line in infile:
+#            i+=1
+#            line=line.strip()
+#            line=line.split('%')[0]
+#            if line.startswith('%'):
+#                continue
+#            if line == '\n':
+#                continue
+#            if oldline != '':
+#                if line.startswith(entry_command):
+#                    raise ValueError(f"new entry starts at line {k}, but previous entry not closed!")
+#                else: 
+#                    line=oldline+line
+#            
+#            try:
+#                matches=find_matching_parens(line)
+#                split_indices=[
+#                    matches[0][0], matches[0][1],
+#                    matches[1][0], matches[1][1],
+#                    matches[-1][0], matches[-1][1]
+#                ]
+#                substrings=[line[start:end] for start, end in zip([0] + split_indices, split_indices + [None])]
+#                entry=substrings[1][1:]
+#                key=substring[3][1:]
+#                meanings=substrings[5][1:]
+#
+#                #end 
+#                oldline=''
+#            except:
+#                oldline=line
+#                continue
+#            #ENTRY
+#            jsonstring+='{"entry": "'
+#            jsonstring+=entry
+#            jsonstring+='", '
+#            #KEY
+#            jsonstring+='"key": "'
+#            jsonstring+=key
+#            jsonstring+='", meanings": [ '
+#            #MEANINGS
+#            j=0
+#            for meaning in meanings.split('\m'):
+#                j+=1
+#                #WORD TYPE
+#                #TODO what if no word type???
+#                type_matches=find_matching_parens(meaning, {'(':')'})
+#                type_split_indices=[0]
+#                meaning_substrings=[meaning[start:end] for start, end in zip([0] + type_split_indices, type_split_indices + [None])]
+#                word_type=meaning_substrings[1][1:]
+#                jsonstring+=' { "type": "'
+#                jsonstring+=word_type
+#                jsonstring+='", '
+#                # DESCRIPTION
+#                descriptions=meaning_substrings[2][2:].split(examples_delim)[0]
+#                jsonstring+='"description": "'
+#                jsonstring+=description
+#                jsonstring+='", '
+#                # EXAMPLES
+#                # only execute if try: 
+#                # except: "examples" : [] 
+#                examples_raw=meaning_substrings[2][2:].split(examples_delim)[1]
+#                examples=examples.split(example_delim)
+#                jsonstring+='"examples": ['
+#                k=0
+#                for example in examples:
+#                    k+=1
+#                    jsonstring+='"'
+#                    jsonstring+=example
+#                    jsonstring+='", '
+#                jsonstring+=']????'
+                    
+    
+    #OLD VERSION
    verboseprint("INFO: opening file:", filename)
    with open(filename) as infile:
        jsonstring='{ "entries" : [ '
@ -134,18 +240,19 @@ def json2tex(entries):
        entries_tex+="\n"
    return entries_tex

-def tex2json(entry):
-    entry_json=json.dumps(json.loads(entry), ensure_ascii=False)
-    return entry_json
+#def tex2json(entry):
+#    entry_json=json.dumps(json.loads(entry), ensure_ascii=False)
+#    return entry_json

-if informat == "json":
+if __name__ == "__main__":
+    if informat == "json":
        data = get_data_from_jsonfile(inputfile)
-elif informat == 'tex':
+    elif informat == 'tex':
        data = get_data_from_texfile(inputfile)

-if outformat == 'json':
+    if outformat == 'json':
        print(data)
-elif outformat == 'tex':
+    elif outformat == 'tex':
        print(json2tex(data))

-exit(0)
+    exit(0)
--- a/texdict/todo.md
+++ b/texdict/todo.md
@ -97,6 +97,21 @@ example.json
    ]
 }
 ```
+\glossary{entry}{key}{
+    \m (type) description, description, description ; example, example 
+    \m 
+}
+- tex2json
+```
+def tex2json(dict_tex)
+    meaning_delim='\m '
+    type_delim=['(',') ']
+    description_delim=', '
+    examples_delim='; '
+    example_delim=', '
+    # how to cope with multiline entries? 
+        - [find closing brackets](https://stackoverflow.com/q/63382152)
+```
 - notes: 
    - `type` could also include gender/genus? 
    - where should I put info like 'go, went, gone' or plural forms for some words?
@ -105,3 +120,12 @@ example.json
    - `dialect/usage/gebrauch`
        - https://www.duden.de/hilfe/gebrauch
    - statt `description` = `meaning`? 
+    - what about `see also`s and confer (cf.)
+    - what about `aussprache`?
+
+```
+\glossary{entry}{key}{
+    \m (type) description, description, description ; example, example 
+    \m 
+}
+- tex2json