"""track_changes_exporter.py

Helper script for the track_changes workflow.
"""
import sys, os, datetime, re
import diff_match_patch as dmp_module
try:
import docx
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
except ImportError:
sys.exit("Install python-docx")
def add_run(p, text, tag, rid):
    """Append a tracked-revision run to paragraph *p*.

    Builds a ``w:<tag>`` revision element (callers in this file pass
    ``'ins'`` or ``'del'``) that wraps a single ``w:r`` run holding *text*,
    and appends it to the paragraph's underlying ``_p`` XML element.

    Args:
        p: Paragraph object exposing the ``_p`` XML element (as returned by
            ``Document.add_paragraph()`` in this file).
        text: Text of the revision. Falsy text is a no-op.
        tag: Revision kind; ``'del'`` stores the text in ``w:delText``,
            anything else stores it in ``w:t``.
        rid: Value written to the revision's ``w:id`` attribute.

    Returns:
        The next id to use: ``rid + 1`` if an element was appended,
        otherwise *rid* unchanged.
    """
    if not text:
        return rid

    # The trailing 'Z' designates UTC, so the clock reading itself must be
    # UTC.  (Previously a naive local time was suffixed with 'Z', shifting
    # every revision timestamp by the machine's UTC offset.)
    stamp = datetime.datetime.now(datetime.timezone.utc).strftime(
        '%Y-%m-%dT%H:%M:%SZ'
    )

    revision = OxmlElement(f'w:{tag}')
    revision.set(qn('w:id'), str(rid))
    revision.set(qn('w:author'), 'Gemini')
    revision.set(qn('w:date'), stamp)

    # Deleted text must live in w:delText; inserted text uses a plain w:t.
    text_el = OxmlElement('w:delText' if tag == 'del' else 'w:t')
    # Keep leading/trailing whitespace from being collapsed.
    text_el.set(qn('xml:space'), 'preserve')
    text_el.text = text

    run = OxmlElement('w:r')
    run.append(text_el)
    revision.append(run)
    p._p.append(revision)
    return rid + 1
def count_alphanumeric_words(text: str) -> int:
    """Return how many word tokens *text* contains.

    A word is a run of ``\\w`` characters (letters, digits, underscore),
    optionally continued by further runs joined with a single apostrophe or
    hyphen, so ``don't`` and ``stop-gap`` each count once.  Whitespace and
    punctuation on their own contribute nothing.
    """
    word_matches = re.finditer(r'\w+(?:[\'-]\w+)*', text)
    return sum(1 for _ in word_matches)
def _merge_and_order_diffs(diffs: list) -> list:
    """Normalise a diff list into merged, delete-before-insert order.

    *diffs* is a sequence of ``(op, text)`` pairs where ``op`` is ``-1``
    (delete), ``1`` (insert) or anything else (treated as equal and emitted
    with op ``0``).  Within each run of edits between two equal segments,
    all deletions are concatenated into one entry and all insertions into
    another, with the deletion emitted first.  Adjacent equal segments are
    concatenated as well.

    Returns:
        A new list of ``(op, text)`` tuples; the input is not modified.
    """
    ordered = []
    removed = []
    added = []

    def flush_pending():
        # Emit the buffered edits, deletion first, then reset the buffers.
        if removed:
            ordered.append((-1, "".join(removed)))
            removed.clear()
        if added:
            ordered.append((1, "".join(added)))
            added.clear()

    for op, text in diffs:
        if op == -1:
            removed.append(text)
            continue
        if op == 1:
            added.append(text)
            continue

        # Equal segment: close out the edit run that preceded it.
        flush_pending()
        if ordered and ordered[-1][0] == 0:
            # Two equal segments in a row collapse into one.
            previous_text = ordered.pop()[1]
            ordered.append((0, previous_text + text))
        else:
            ordered.append((0, text))

    flush_pending()
    return ordered
def clean_word_salad(diffs: list) -> list:
    """Absorb short unchanged fragments that are sandwiched between edits.

    An equal segment is rewritten as a delete-plus-insert of the same text
    when it has edits containing words on both sides and either

    * it contains no words at all (only whitespace/punctuation), or
    * it contains 1-3 words and each neighbouring edit run contains at
      least that many words.

    After each rewrite the list is re-merged and scanned again from the
    start, so the surrounding edits coalesce into a single delete/insert
    pair.  The process repeats until no segment qualifies.

    Args:
        diffs: List of ``(op, text)`` tuples with op in ``{-1, 0, 1}``.

    Returns:
        The merged, cleaned list of ``(op, text)`` tuples.
    """

    def edit_words(start, step):
        # Total words in the contiguous run of non-equal entries beginning
        # at ``start`` and walking in direction ``step`` (-1 or +1).
        total = 0
        pos = start
        while 0 <= pos < len(diffs) and diffs[pos][0] != 0:
            total += count_alphanumeric_words(diffs[pos][1])
            pos += step
        return total

    while True:
        diffs = _merge_and_order_diffs(diffs)
        for idx, (op, text) in enumerate(diffs):
            if op != 0:
                continue
            words_before = edit_words(idx - 1, -1)
            words_after = edit_words(idx + 1, 1)
            if words_before == 0 or words_after == 0:
                continue
            kept_words = count_alphanumeric_words(text)
            if kept_words <= 3 and words_before >= kept_words and words_after >= kept_words:
                # Replace the equal entry by delete+insert of the same text,
                # then restart the scan on a freshly merged list.
                diffs[idx:idx + 1] = [(-1, text), (1, text)]
                break
        else:
            # A full pass found nothing to rewrite.
            return _merge_and_order_diffs(diffs)
def generate_human_track_changes(text1: str, text2: str) -> list:
    """Compute a word-level diff between *text1* and *text2*.

    Both texts are split into tokens (words, whitespace runs, punctuation
    runs).  Each distinct token is mapped to a single placeholder character
    so that diff_match_patch, which diffs character by character, effectively
    diffs token by token.  The placeholder diff is then mapped back to the
    original tokens and passed through ``clean_word_salad``.

    Args:
        text1: Original text.
        text2: Revised text.

    Returns:
        List of ``(op, text)`` tuples where op is ``-1`` (deleted),
        ``0`` (unchanged) or ``1`` (inserted).

    Raises:
        ValueError: If the two texts together contain more distinct tokens
            than there are placeholder code points available.
    """
    # Every character is \w, \s or neither, so the tokens tile the text
    # exactly and joining them reproduces the input.
    token_pattern = re.compile(r'\w+(?:[\'-]\w+)*|\s+|[^\w\s]+')

    # Placeholders never leave this function, so any code point will do.
    # Starting at U+E000 (just above the surrogate range) leaves room for
    # about a million distinct tokens; the previous start of U+100000 left
    # only 65,536 before chr() failed with an opaque range error.
    first_code = 0xE000
    token_to_char = {}
    char_to_token = {}

    def encode(text):
        # Translate text into one placeholder character per token.
        chars = []
        for token in token_pattern.findall(text):
            char = token_to_char.get(token)
            if char is None:
                code = first_code + len(token_to_char)
                if code > sys.maxunicode:
                    raise ValueError(
                        "too many distinct tokens to diff "
                        f"(limit {sys.maxunicode - first_code + 1})"
                    )
                char = chr(code)
                token_to_char[token] = char
                char_to_token[char] = token
            chars.append(char)
        return "".join(chars)

    dmp = dmp_module.diff_match_patch()
    dmp.Diff_Timeout = 0  # 0 = no time limit on the diff computation
    encoded1 = encode(text1)
    encoded2 = encode(text2)
    diffs = dmp.diff_main(encoded1, encoded2, False)

    # Expand each placeholder back into the token it stands for.
    word_diffs = [
        (op, "".join(char_to_token[c] for c in chars))
        for op, chars in diffs
    ]
    return clean_word_salad(word_diffs)
def create_docx(orig, rev, out=None):
    """Write a .docx showing the changes from *orig* to *rev* as revisions.

    Args:
        orig: Path of the original UTF-8 text file.
        rev: Path of the revised UTF-8 text file.
        out: Destination path.  When omitted, the revised file's path with
            its extension replaced by ``_TC.docx`` is used.

    Both files are read in full, diffed with
    ``generate_human_track_changes``, and the result is laid out in a new
    document: unchanged text as normal runs, deletions and insertions as
    ``w:del`` / ``w:ins`` elements via ``add_run``.  Every ``\\n`` in any
    diff segment starts a new paragraph.
    """
    if out is None:
        out = f"{os.path.splitext(rev)[0]}_TC.docx"

    with open(orig, 'r', encoding='utf-8') as handle:
        original_text = handle.read().replace('\r\n', '\n')
    with open(rev, 'r', encoding='utf-8') as handle:
        revised_text = handle.read().replace('\r\n', '\n')

    diffs = generate_human_track_changes(original_text, revised_text)

    document = docx.Document()
    # Adds a w:trackRevisions element to the document settings.
    document.settings.element.append(OxmlElement('w:trackRevisions'))

    revision_id = 0
    paragraph = document.add_paragraph()
    for op, text in diffs:
        for index, segment in enumerate(text.split('\n')):
            # Each newline inside the segment opens a fresh paragraph.
            if index > 0:
                paragraph = document.add_paragraph()
            if not segment:
                continue
            if op == 0:
                paragraph.add_run(segment)
            elif op == -1:
                revision_id = add_run(paragraph, segment, 'del', revision_id)
            elif op == 1:
                revision_id = add_run(paragraph, segment, 'ins', revision_id)

    document.save(out)
if __name__ == '__main__':
    # Command-line entry point: ORIGINAL and REVISED text file paths are
    # required.  Exit with a usage message instead of an IndexError
    # traceback when they are missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {os.path.basename(sys.argv[0])} ORIGINAL.txt REVISED.txt")
    create_docx(sys.argv[1], sys.argv[2])