Skip to content

Instantly share code, notes, and snippets.

@tinytengu
Created August 30, 2023 13:11
Show Gist options
  • Save tinytengu/4a7d9cc49c3130783ab000631002cc3a to your computer and use it in GitHub Desktop.
Save tinytengu/4a7d9cc49c3130783ab000631002cc3a to your computer and use it in GitHub Desktop.
Python 3 file & text difference detection
import difflib
from dataclasses import dataclass, field
@dataclass(frozen=True)
class LineAdded:
    """A line that exists only in the new text."""

    # 0-based index that get_text_diffs assigned to this line's diff group.
    line: int
    # Text of the added line, without the Differ prefix.
    content: str
@dataclass(frozen=True)
class LineRemoved:
    """A line that exists only in the source text."""

    # 0-based index that get_text_diffs assigned to this line's diff group.
    line: int
    # Text of the removed line, without the Differ prefix.
    content: str
@dataclass(frozen=True)
class LineChanged:
    """A source line that was replaced by a different line in the new text."""

    # 0-based index that get_text_diffs assigned to this line's diff group.
    line: int
    # Line text in the source.
    before: str
    # Line text in the new version.
    after: str
    # Character positions flagged with "+" or "^" in the Differ hint line.
    idxs: list[int]
@dataclass(frozen=True)
class TextDiffs:
    """Differences between two texts, bucketed by kind.

    The dataclass is frozen, so the three attributes cannot be rebound,
    but the lists themselves are filled in place by ``get_text_diffs``.
    """

    added: list[LineAdded] = field(default_factory=list)
    removed: list[LineRemoved] = field(default_factory=list)
    changed: list[LineChanged] = field(default_factory=list)

    @property
    def all(self) -> list[LineAdded | LineRemoved | LineChanged]:
        """Return a new list: additions, then removals, then changes."""
        return self.added + self.removed + self.changed
def get_diff_positions(text: str) -> list[int]:
    """Return the indices of every ``+`` or ``^`` character in *text*.

    Intended for the text of a ``difflib.Differ`` hint (``?``) line, where
    those characters mark inserted and replaced positions.
    """
    return [idx for idx, char in enumerate(text) if char in "+^"]
def get_text_diffs(source: str, new: str, line_sep: str = "\n") -> TextDiffs:
    """Diff two texts line by line and classify the differences.

    Both texts are split on *line_sep* and compared with
    ``difflib.Differ``. Differ records that describe the same logical line
    are grouped: a removed line (``-``) absorbs the first added line
    (``+``) that follows it together with any ``?`` hint lines and is
    reported as a change; every other ``-`` / ``+`` record is reported as a
    removal / addition of its own.

    Args:
        source: Original text.
        new: Text to compare against *source*.
        line_sep: Separator used to split both texts into lines.

    Returns:
        A ``TextDiffs`` whose entries carry the 0-based index of their
        group in the merged sequence of unchanged, removed, changed and
        added lines.
    """
    differ = difflib.Differ()
    diffs = differ.compare(source.split(line_sep), new.split(line_sep))

    # One inner list per logical line; each item is a parsed Differ record.
    groups: list[list[dict[str, str]]] = []

    for line in diffs:
        code, text = line[:1], line[2:]
        entry = {"code": code, "text": text}

        if code == "?":
            # Hint lines annotate the record emitted right before them.
            if groups:
                groups[-1].append(entry)
            continue

        if code == "+" and groups:
            current = groups[-1]
            already_paired = any(item["code"] == "+" for item in current)
            # A "+" completes a pending "-" at most once. Any other "+"
            # (including one that opens the diff, when no group exists yet)
            # is an independent addition and gets its own group below.
            if current[0]["code"] == "-" and not already_paired:
                current.append(entry)
                continue

        groups.append([entry])

    changes = TextDiffs()

    for line_idx, data in enumerate(groups):
        head = data[0]

        if head["code"] == "-":
            if len(data) == 1:
                changes.removed.append(
                    LineRemoved(line=line_idx, content=head["text"])
                )
                continue

            new_text = ""
            idxs: list[int] = []
            # Hint lines are optional, so scan instead of indexing blindly.
            for entry in data[1:]:
                if entry["code"] == "+":
                    new_text = entry["text"]
                elif entry["code"] == "?":
                    idxs = get_diff_positions(entry["text"])

            changes.changed.append(
                LineChanged(
                    line=line_idx, before=head["text"], after=new_text, idxs=idxs
                )
            )
        elif head["code"] == "+":
            changes.added.append(LineAdded(line=line_idx, content=head["text"]))

    return changes
def get_file_diffs(source: str, new: str) -> TextDiffs:
    """Read two UTF-8 text files and return their line differences.

    Args:
        source: Path of the original file.
        new: Path of the file to compare against *source*.

    Returns:
        The result of ``get_text_diffs`` applied to both file contents.
    """
    with open(source, "rt", encoding="utf-8") as file:
        content1 = file.read()

    with open(new, "rt", encoding="utf-8") as file:
        content2 = file.read()

    return get_text_diffs(content1, content2)
# Example run: compare two CSV files and print every detected difference.
changes = get_file_diffs(r"C:\file1.csv", r"C:\file2.csv")
print(changes.all)
@tinytengu
Copy link
Author

import difflib
import pandas as pd
from dataclasses import dataclass, field


@dataclass(frozen=True)
class LineAdded:
    """A line that exists only in the new text."""

    # 0-based index that get_text_diffs assigned to this line's diff group.
    line: int
    # Text of the added line, without the Differ prefix.
    content: str


@dataclass(frozen=True)
class LineRemoved:
    """A line that exists only in the source text."""

    # 0-based index that get_text_diffs assigned to this line's diff group.
    line: int
    # Text of the removed line, without the Differ prefix.
    content: str


@dataclass(frozen=True)
class LineChanged:
    """A source line that was replaced by a different line in the new text."""

    # 0-based index that get_text_diffs assigned to this line's diff group.
    line: int
    # Line text in the source.
    before: str
    # Line text in the new version.
    after: str
    # Character positions flagged with "+" or "^" in the Differ hint line.
    idxs: list[int]


@dataclass(frozen=True)
class TextDiffs:
    """Container for the differences found between two texts.

    Although the dataclass is frozen (attributes cannot be rebound), the
    three lists are mutable and are populated in place by the diff code.
    """

    added: list[LineAdded] = field(default_factory=list)
    removed: list[LineRemoved] = field(default_factory=list)
    changed: list[LineChanged] = field(default_factory=list)

    @property
    def all(self) -> list[LineAdded | LineRemoved | LineChanged]:
        """Return one new list: additions first, then removals, then changes."""
        return [*self.added, *self.removed, *self.changed]


def get_diff_positions(text: str) -> list[int]:
    """Collect the indices in *text* that hold a ``+`` or ``^`` marker."""
    markers = ("+", "^")
    positions = []
    for position, symbol in enumerate(text):
        if symbol in markers:
            positions.append(position)
    return positions


def get_text_diffs(source: str, new: str, line_sep: str = "\n") -> TextDiffs:
    """Diff two texts line by line and classify the differences.

    Both texts are split on *line_sep* and compared with
    ``difflib.Differ``. Differ records that describe the same logical line
    are grouped: a removed line (``-``) absorbs the first added line
    (``+``) that follows it together with any ``?`` hint lines and is
    reported as a change; every other ``-`` / ``+`` record is reported as a
    removal / addition of its own.

    Args:
        source: Original text.
        new: Text to compare against *source*.
        line_sep: Separator used to split both texts into lines.

    Returns:
        A ``TextDiffs`` whose entries carry the 0-based index of their
        group in the merged sequence of unchanged, removed, changed and
        added lines.
    """
    differ = difflib.Differ()
    diffs = differ.compare(source.split(line_sep), new.split(line_sep))

    # One inner list per logical line; each item is a parsed Differ record.
    groups: list[list[dict[str, str]]] = []

    for line in diffs:
        code, text = line[:1], line[2:]
        entry = {"code": code, "text": text}

        if code == "?":
            # Hint lines annotate the record emitted right before them.
            if groups:
                groups[-1].append(entry)
            continue

        if code == "+" and groups:
            current = groups[-1]
            already_paired = any(item["code"] == "+" for item in current)
            # A "+" completes a pending "-" at most once. Any other "+"
            # (including one that opens the diff, when no group exists yet)
            # is an independent addition and gets its own group below.
            if current[0]["code"] == "-" and not already_paired:
                current.append(entry)
                continue

        groups.append([entry])

    changes = TextDiffs()

    for line_idx, data in enumerate(groups):
        head = data[0]

        if head["code"] == "-":
            if len(data) == 1:
                changes.removed.append(
                    LineRemoved(line=line_idx, content=head["text"])
                )
                continue

            new_text = ""
            idxs: list[int] = []
            # Hint lines are optional, so scan instead of indexing blindly.
            for entry in data[1:]:
                if entry["code"] == "+":
                    new_text = entry["text"]
                elif entry["code"] == "?":
                    idxs = get_diff_positions(entry["text"])

            changes.changed.append(
                LineChanged(
                    line=line_idx, before=head["text"], after=new_text, idxs=idxs
                )
            )
        elif head["code"] == "+":
            changes.added.append(LineAdded(line=line_idx, content=head["text"]))

    return changes


def get_excel_diffs(source: str, new: str, sheet_name: str) -> TextDiffs:
    """Diff one sheet of two Excel workbooks as pipe-separated text.

    Each workbook's *sheet_name* sheet is loaded with pandas and rendered
    to CSV text (no index column, ``|`` separator, ``\\n`` line endings,
    floats formatted with ``%.0f``); the two texts are then compared with
    ``get_text_diffs``.

    Args:
        source: Path of the original workbook.
        new: Path of the workbook to compare against *source*.
        sheet_name: Sheet to read from both workbooks.
    """
    csv_options = {
        "index": False,
        "sep": "|",
        "lineterminator": "\n",
        "float_format": "%.0f",
    }
    # The generator is consumed in order, so *source* is read before *new*.
    source_text, new_text = (
        pd.read_excel(path, sheet_name).to_csv(**csv_options)
        for path in (source, new)
    )
    return get_text_diffs(source_text, new_text)


def get_file_diffs(source: str, new: str) -> TextDiffs:
    """Load two UTF-8 text files and diff their contents.

    Args:
        source: Path of the original file.
        new: Path of the file to compare against *source*.

    Returns:
        Whatever ``get_text_diffs`` reports for the two file contents.
    """
    contents = []
    # Read *source* first, then *new*, closing each file before moving on.
    for path in (source, new):
        with open(path, "rt", encoding="utf-8") as handle:
            contents.append(handle.read())

    source_text, new_text = contents
    return get_text_diffs(source_text, new_text)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment