← Back to Workflow
Hidden

track_changes_exporter.py

Helper script for the track_changes workflow.

import sys, os, datetime, re
import diff_match_patch as dmp_module

try:
    import docx
    from docx.oxml import OxmlElement
    from docx.oxml.ns import qn
except ImportError:
    sys.exit("Install python-docx")

def add_run(p, text, tag, rid):
    """Append a tracked-change run (insertion or deletion) to a paragraph.

    Builds ``<w:{tag}>`` wrapping a single ``<w:r>`` whose text element is
    ``<w:delText>`` for deletions and ``<w:t>`` otherwise, and appends it to
    the paragraph's underlying XML element.

    Args:
        p: python-docx paragraph; its ``_p`` element receives the revision.
        text: Text carried by the revision. Empty text is a no-op.
        tag: Local name of the revision wrapper, ``'ins'`` or ``'del'``.
        rid: Revision id written to ``w:id``.

    Returns:
        The next unused revision id: ``rid + 1`` when an element was added,
        ``rid`` unchanged when ``text`` was empty.
    """
    if not text:
        return rid

    revision = OxmlElement(f'w:{tag}')
    revision.set(qn('w:id'), str(rid))
    revision.set(qn('w:author'), 'Gemini')
    # The trailing 'Z' designates UTC, so the clock must be read in UTC;
    # a naive local now() would be mislabeled on any non-UTC machine.
    stamp = datetime.datetime.now(datetime.timezone.utc)
    revision.set(qn('w:date'), stamp.strftime('%Y-%m-%dT%H:%M:%SZ'))

    text_el = OxmlElement('w:delText' if tag == 'del' else 'w:t')
    # Keep leading/trailing whitespace of the fragment intact.
    text_el.set(qn('xml:space'), 'preserve')
    text_el.text = text

    run = OxmlElement('w:r')
    run.append(text_el)
    revision.append(run)
    p._p.append(revision)
    return rid + 1

def count_alphanumeric_words(text: str) -> int:
    """Count the words in *text*.

    A word is a run of ``\\w`` characters, optionally continued across single
    apostrophes or hyphens (so "don't" and "well-known" each count once).
    Whitespace and punctuation on their own contribute nothing.
    """
    word_pattern = r'\w+(?:[\'-]\w+)*'
    return sum(1 for _ in re.finditer(word_pattern, text))

def _merge_and_order_diffs(diffs: list) -> list:
    """Normalize a diff list so each edit region is one deletion then one insertion.

    Between consecutive equalities, all deleted fragments are concatenated
    into a single ``(-1, text)`` entry and all inserted fragments into a
    single ``(1, text)`` entry, emitted in that order. Adjacent equalities
    are fused into one ``(0, text)`` entry.

    Args:
        diffs: Iterable of ``(op, text)`` pairs; ``-1`` is a deletion, ``1``
            an insertion, anything else is treated as an equality.

    Returns:
        A new list of ``(op, text)`` tuples; the input is not modified.
    """
    result = []
    pending = {-1: [], 1: []}

    def flush():
        # Emit buffered edits, deletion first, and reset the buffers.
        for kind in (-1, 1):
            if pending[kind]:
                result.append((kind, "".join(pending[kind])))
                pending[kind] = []

    for op, text in diffs:
        if op in pending:
            pending[op].append(text)
            continue
        flush()
        if result and result[-1][0] == 0:
            result[-1] = (0, result[-1][1] + text)
        else:
            result.append((0, text))

    flush()
    return result

def clean_word_salad(diffs: list) -> list:
    """Absorb tiny equalities that are sandwiched between edits.

    After normalizing with ``_merge_and_order_diffs``, looks for the first
    equality that has edits on both sides and is "spurious": it contains at
    most three words, and the edits on each side contain at least as many
    words as the equality itself (and at least one). Such an equality is
    replaced by a deletion plus an insertion of the same text, so it merges
    into the neighbouring edits on the next normalization pass. The process
    repeats until no spurious equality remains.

    Args:
        diffs: List of ``(op, text)`` pairs with ops ``-1``, ``0``, ``1``.

    Returns:
        A new, normalized list of ``(op, text)`` tuples.
    """

    def adjacent_edit_words(seq, start, step):
        # Total word count of the unbroken run of edits that begins at
        # `start` and extends in direction `step` until an equality or
        # either end of the list.
        total = 0
        pos = start
        while 0 <= pos < len(seq) and seq[pos][0] != 0:
            total += count_alphanumeric_words(seq[pos][1])
            pos += step
        return total

    while True:
        diffs = _merge_and_order_diffs(diffs)
        spurious_at = None
        for idx, (op, text) in enumerate(diffs):
            if op != 0:
                continue
            kept = count_alphanumeric_words(text)
            before = adjacent_edit_words(diffs, idx - 1, -1)
            after = adjacent_edit_words(diffs, idx + 1, 1)
            if before > 0 and after > 0 and kept <= 3 and min(before, after) >= kept:
                spurious_at = idx
                break
        if spurious_at is None:
            break
        # Re-express the equality as delete + insert of the same text; the
        # next normalization pass folds it into the surrounding edits.
        text = diffs[spurious_at][1]
        diffs[spurious_at:spurious_at + 1] = [(-1, text), (1, text)]

    return _merge_and_order_diffs(diffs)

def generate_human_track_changes(text1: str, text2: str) -> list:
    """Compute a token-level diff of two texts, tidied for human review.

    Each text is split into tokens: words (optionally joined by single
    apostrophes or hyphens), runs of whitespace, and runs of other
    characters. Every distinct token is mapped to one placeholder character
    so that diff_match_patch's character diff effectively compares whole
    tokens. The resulting diff is mapped back to the original token text and
    passed through ``clean_word_salad``.

    Args:
        text1: The original text.
        text2: The revised text.

    Returns:
        List of ``(op, text)`` tuples where ``op`` is ``-1`` (deleted),
        ``0`` (unchanged) or ``1`` (inserted).
    """
    token_pattern = r'\w+(?:[\'-]\w+)*|\s+|[^\w\s]+'

    dmp = dmp_module.diff_match_patch()
    # NOTE(review): 0 is understood to disable the diff time limit in
    # diff-match-patch -- confirm against the library documentation.
    dmp.Diff_Timeout = 0

    tokens1 = re.findall(token_pattern, text1)
    tokens2 = re.findall(token_pattern, text2)

    token_to_char = {}
    char_to_token = {}
    # Placeholders are allocated upward from U+E000. Every code point from
    # there through U+10FFFF is valid for chr() and lies above the surrogate
    # block, leaving room for about 1.05 million distinct tokens. Starting
    # higher (e.g. at U+100000) would leave only 65,536 placeholders before
    # chr() raises ValueError on large documents.
    next_char_code = 0xE000

    def tokens_to_string(tokens):
        # Encode a token sequence as a string of placeholder characters,
        # assigning a new placeholder the first time a token is seen.
        nonlocal next_char_code
        chars = []
        for token in tokens:
            if token not in token_to_char:
                char = chr(next_char_code)
                token_to_char[token] = char
                char_to_token[char] = token
                next_char_code += 1
            chars.append(token_to_char[token])
        return "".join(chars)

    str1 = tokens_to_string(tokens1)
    str2 = tokens_to_string(tokens2)

    # checklines=False: the inputs are placeholder strings, not real lines.
    diffs = dmp.diff_main(str1, str2, False)

    # Decode each placeholder back to the token it stands for.
    word_diffs = [
        (op, "".join(char_to_token[c] for c in char_text))
        for op, char_text in diffs
    ]
    return clean_word_salad(word_diffs)

def create_docx(orig, rev, out=None):
    """Write a .docx showing the differences between two text files as tracked changes.

    Args:
        orig: Path to the original UTF-8 text file.
        rev: Path to the revised UTF-8 text file.
        out: Destination .docx path. When omitted, the revised file's path
            with its extension replaced by ``_TC.docx`` is used.

    Both files are read with CRLF normalized to LF. Unchanged text becomes
    ordinary runs; deleted and inserted text become ``w:del`` / ``w:ins``
    revisions via ``add_run``. Every newline in any diff fragment starts a
    new paragraph.
    """
    if out is None:
        out = f"{os.path.splitext(rev)[0]}_TC.docx"

    contents = []
    for path in (orig, rev):
        with open(path, 'r', encoding='utf-8') as handle:
            contents.append(handle.read().replace('\r\n', '\n'))
    original_text, revised_text = contents

    diffs = generate_human_track_changes(original_text, revised_text)

    doc = docx.Document()
    # Add the w:trackRevisions flag to the document settings.
    doc.settings.element.append(OxmlElement('w:trackRevisions'))

    revision_tags = {-1: 'del', 1: 'ins'}
    rid = 0
    paragraph = doc.add_paragraph()

    for op, text in diffs:
        for index, segment in enumerate(text.split('\n')):
            # Each newline inside the fragment opens a fresh paragraph.
            if index:
                paragraph = doc.add_paragraph()
            if not segment:
                continue
            if op == 0:
                paragraph.add_run(segment)
            elif op in revision_tags:
                rid = add_run(paragraph, segment, revision_tags[op], rid)

    doc.save(out)

if __name__ == '__main__':
    # Two positional arguments are required: the original and the revised
    # text file. Exit with a usage message rather than an IndexError
    # traceback when either is missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {os.path.basename(sys.argv[0])} ORIGINAL_TXT REVISED_TXT")
    create_docx(sys.argv[1], sys.argv[2])

This is used in: