texdict.py: get_data_from_texfile() cleaner rewrite attempt

This commit is contained in:
uni@bor.mac 2025-07-21 22:24:25 +02:00
parent 64de297467
commit c08442f2ea
2 changed files with 143 additions and 12 deletions

View File

@ -45,8 +45,114 @@ def get_data_from_jsonfile(filename):
verboseprint("INFO:", json.dumps(data, ensure_ascii=False)) verboseprint("INFO:", json.dumps(data, ensure_ascii=False))
return json.dumps(data, ensure_ascii=False) return json.dumps(data, ensure_ascii=False)
def find_matching_parens(s, braces=None):
    """Locate every balanced pair of braces in *s*.

    Args:
        s: Text to scan.
        braces: Optional mapping of opening character to closing character.
            Defaults to ``{"{": "}"}`` when omitted or empty.

    Returns:
        A list of ``[open_index, close_index]`` pairs, ordered by the
        position of the closing brace (so an inner pair is listed before
        the pair that encloses it).

    Raises:
        ValueError: If a closer appears with no opener pending, if a closer
            does not match the most recent opener, or if an opener is
            never closed.
    """
    openers = braces or {"{": "}"}
    closers = {v: k for k, v in openers.items()}
    stack = []
    result = []
    for i, c in enumerate(s):
        if c in openers:
            stack.append((c, i))
        elif c in closers:
            if not stack:
                raise ValueError(f"tried to close brace without an open at position {i}")
            pair, idx = stack.pop()
            # Validate before recording so a bad pair never lands in result.
            if pair != closers[c]:
                raise ValueError(f"mismatched brace at position {i}")
            result.append([idx, i])
    if stack:
        # Report where the first unclosed opener sits; the loop variable
        # would only give the index of the last character scanned.
        raise ValueError(f"no closing brace at position {stack[0][1]}")
    return result
# READ TEX FILE # READ TEX FILE
def get_data_from_texfile(filename): def get_data_from_texfile(filename):
# # delimiters
# meaning_delim='\m '
# type_delim=['(',') ']
# description_delim=', '
# examples_delim='; '
# example_delim=', ' # what if a sentence contains a comma? maybe use '. ' instead -- but what if the example is a question?
# entry_command='\dictentrysorted'
# verboseprint("INFO: opening file:", filename)
# with open(filename) as infile:
# oldline=''
# i=0
# jsonstring='{ "entries" : [ '
# for line in infile:
# i+=1
# line=line.strip()
# line=line.split('%')[0]
# if line.startswith('%'):
# continue
# if line == '\n':
# continue
# if oldline != '':
# if line.startswith(entry_command):
# raise ValueError(f"new entry starts at line {k}, but previous entry not closed!")
# else:
# line=oldline+line
#
# try:
# matches=find_matching_parens(line)
# split_indices=[
# matches[0][0], matches[0][1],
# matches[1][0], matches[1][1],
# matches[-1][0], matches[-1][1]
# ]
# substrings=[line[start:end] for start, end in zip([0] + split_indices, split_indices + [None])]
# entry=substrings[1][1:]
# key=substring[3][1:]
# meanings=substrings[5][1:]
#
# #end
# oldline=''
# except:
# oldline=line
# continue
# #ENTRY
# jsonstring+='{"entry": "'
# jsonstring+=entry
# jsonstring+='", '
# #KEY
# jsonstring+='"key": "'
# jsonstring+=key
# jsonstring+='", meanings": [ '
# #MEANINGS
# j=0
# for meaning in meanings.split('\m'):
# j+=1
# #WORD TYPE
# #TODO what if no word type???
# type_matches=find_matching_parens(meaning, {'(':')'})
# type_split_indices=[0]
# meaning_substrings=[meaning[start:end] for start, end in zip([0] + type_split_indices, type_split_indices + [None])]
# word_type=meaning_substrings[1][1:]
# jsonstring+=' { "type": "'
# jsonstring+=word_type
# jsonstring+='", '
# # DESCRIPTION
# descriptions=meaning_substrings[2][2:].split(examples_delim)[0]
# jsonstring+='"description": "'
# jsonstring+=description
# jsonstring+='", '
# # EXAMPLES
# # TODO: wrap the examples lookup below in try/except;
# # on failure emit "examples" : []
# examples_raw=meaning_substrings[2][2:].split(examples_delim)[1]
# examples=examples.split(example_delim)
# jsonstring+='"examples": ['
# k=0
# for example in examples:
# k+=1
# jsonstring+='"'
# jsonstring+=example
# jsonstring+='", '
# jsonstring+=']????'
#OLD VERSION
verboseprint("INFO: opening file:", filename) verboseprint("INFO: opening file:", filename)
with open(filename) as infile: with open(filename) as infile:
jsonstring='{ "entries" : [ ' jsonstring='{ "entries" : [ '
@ -134,18 +240,19 @@ def json2tex(entries):
entries_tex+="\n" entries_tex+="\n"
return entries_tex return entries_tex
def tex2json(entry): #def tex2json(entry):
entry_json=json.dumps(json.loads(entry), ensure_ascii=False) # entry_json=json.dumps(json.loads(entry), ensure_ascii=False)
return entry_json # return entry_json
if informat == "json": if __name__ == "__main__":
data = get_data_from_jsonfile(inputfile) if informat == "json":
elif informat == 'tex': data = get_data_from_jsonfile(inputfile)
data = get_data_from_texfile(inputfile) elif informat == 'tex':
data = get_data_from_texfile(inputfile)
if outformat == 'json': if outformat == 'json':
print(data) print(data)
elif outformat == 'tex': elif outformat == 'tex':
print(json2tex(data)) print(json2tex(data))
exit(0) exit(0)

View File

@ -97,6 +97,21 @@ example.json
] ]
} }
``` ```
\glossary{entry}{key}{
\m (type) description, description, description ; example, example
\m
}
- tex2json
```
def tex2json(dict_tex):
meaning_delim='\m '
type_delim=['(',') ']
description_delim=', '
examples_delim='; '
example_delim=', '
# how to cope with multiline entries?
- [find closing brackets](https://stackoverflow.com/q/63382152)
```
- notes: - notes:
- `type` could also include gender/genus? - `type` could also include gender/genus?
- where should I put info like 'go, went, gone' or plural forms for some words? - where should I put info like 'go, went, gone' or plural forms for some words?
@ -105,3 +120,12 @@ example.json
- `dialect/usage/gebrauch` - `dialect/usage/gebrauch`
- https://www.duden.de/hilfe/gebrauch - https://www.duden.de/hilfe/gebrauch
- statt `description` = `meaning`? - statt `description` = `meaning`?
- what about `see also`s and confer (cf.) references?
- what about `aussprache` (pronunciation)?
```
\glossary{entry}{key}{
\m (type) description, description, description ; example, example
\m
}
- tex2json