Created
August 30, 2023 13:11
-
-
Save tinytengu/4a7d9cc49c3130783ab000631002cc3a to your computer and use it in GitHub Desktop.
Python 3 file & text differences detection
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import difflib | |
from dataclasses import dataclass, field | |
@dataclass(frozen=True)
class LineAdded:
    """A line that is present only in the new text.

    Attributes:
        line: Zero-based integer position of the line in the diff.
        content: Text of the added line.
    """

    # The diff routine fills this with an integer counter, so the
    # annotation is ``int`` (it was previously declared as ``str``).
    line: int
    content: str
@dataclass(frozen=True)
class LineRemoved:
    """A line that is present only in the source text.

    Attributes:
        line: Zero-based integer position of the line in the diff.
        content: Text of the removed line.
    """

    # The diff routine fills this with an integer counter, so the
    # annotation is ``int`` (it was previously declared as ``str``).
    line: int
    content: str
@dataclass(frozen=True)
class LineChanged:
    """A line whose content differs between the source and the new text.

    Attributes:
        line: Zero-based integer position of the line in the diff.
        before: Text of the line in the source.
        after: Text of the line in the new version.
        idxs: Character offsets in the diff hint line that are flagged
            as inserted or replaced.
    """

    # The diff routine fills this with an integer counter, so the
    # annotation is ``int`` (it was previously declared as ``str``).
    line: int
    before: str
    after: str
    idxs: list[int]
@dataclass(frozen=True)
class TextDiffs:
    """Container for the differences found between two texts.

    Each field starts out as its own empty list; entries are collected by
    appending to those lists.
    """

    added: list[LineAdded] = field(default_factory=list)
    removed: list[LineRemoved] = field(default_factory=list)
    changed: list[LineChanged] = field(default_factory=list)

    @property
    def all(self) -> list[LineAdded | LineRemoved | LineChanged]:
        """Return one new list: additions, then removals, then changes."""
        added_then_removed = self.added + self.removed
        return added_then_removed + self.changed
def get_diff_positions(text: str) -> list[int]:
    """Return the offsets in *text* that hold a ``+`` or ``^`` marker.

    Offsets are zero-based and listed in ascending order; every other
    character is ignored.
    """
    positions = []
    for offset, symbol in enumerate(text):
        if symbol in "+^":
            positions.append(offset)
    return positions
def get_text_diffs(source: str, new: str, line_sep: str = "\n") -> TextDiffs:
    """Compare two texts line by line and classify the differences.

    Both texts are split on *line_sep* and fed to ``difflib.Differ``. The
    resulting records are scanned once, front to back:

    * an unchanged line only advances the row counter;
    * a ``-`` record immediately followed by ``+`` (with a ``?`` hint after
      either of them) is reported as one ``LineChanged``;
    * any other ``-`` record is a ``LineRemoved``;
    * any other ``+`` record is a ``LineAdded``.

    The ``line`` stored on each entry is the zero-based row of that entry in
    this merged view, where a changed pair occupies a single row.

    Args:
        source: Original text.
        new: Text to compare against *source*.
        line_sep: Separator used to split both texts into lines.

    Returns:
        A ``TextDiffs`` holding the added, removed and changed lines.
    """
    records = list(
        difflib.Differ().compare(source.split(line_sep), new.split(line_sep))
    )
    total = len(records)

    def code_at(index: int) -> str:
        # One-character record code, or "" when looking past the end.
        return records[index][:1] if index < total else ""

    changes = TextDiffs()
    row = 0
    pos = 0
    while pos < total:
        code, text = records[pos][:1], records[pos][2:]
        pos += 1

        if code == "?":
            # Hint lines are consumed together with their change pair below;
            # a stray one carries no line of its own, so it takes no row.
            continue

        if code == "-":
            # Differ emits an intraline change as "-", ["?"], "+", ["?"],
            # with at least one of the two hints present.
            probe = pos
            if code_at(probe) == "?":
                probe += 1
            hint_before_present = probe > pos
            if code_at(probe) == "+":
                hint_after = (
                    records[probe + 1][2:] if code_at(probe + 1) == "?" else None
                )
                if hint_before_present or hint_after is not None:
                    changes.changed.append(
                        LineChanged(
                            line=row,
                            before=text,
                            after=records[probe][2:],
                            # Without a hint under the new line (pure
                            # deletions) nothing is flagged.
                            idxs=get_diff_positions(hint_after or ""),
                        )
                    )
                    pos = probe + 1 + (1 if hint_after is not None else 0)
                    row += 1
                    continue
            changes.removed.append(LineRemoved(line=row, content=text))
        elif code == "+":
            # Every standalone "+" gets its own row, including one that opens
            # the diff or follows another "+" or "-" record.
            changes.added.append(LineAdded(line=row, content=text))

        row += 1

    return changes
def get_file_diffs(source: str, new: str) -> TextDiffs:
    """Read two UTF-8 text files and diff their contents.

    Args:
        source: Path of the original file.
        new: Path of the file to compare against *source*.

    Returns:
        The result of ``get_text_diffs`` on the two file contents.
    """
    texts = []
    # Read the files one after the other, source first.
    for path in (source, new):
        with open(path, "rt", encoding="utf-8") as handle:
            texts.append(handle.read())
    old_text, new_text = texts
    return get_text_diffs(old_text, new_text)
# Example usage. Guarded so that importing this module does not try to read
# the hard-coded files below.
if __name__ == "__main__":
    changes = get_file_diffs(
        source=r"C:\file1.csv",
        new=r"C:\file2.csv",
    )
    print(changes.all)
Author
tinytengu
commented
Aug 30, 2023
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment