Created
May 17, 2025 21:55
-
-
Save maluta/f7ea9f1f56463cfb66ea3ff9a3c0d2ba to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import re
import tempfile
import time
import zipfile
from datetime import datetime
def extract_txt_from_zip(zip_file, start_date, end_date):
    """Unpack a chat export archive and return its date-filtered text.

    The archive is extracted into a temporary directory that is removed
    again before this function returns.

    Args:
        zip_file: Path to the zip archive.
        start_date: First day to keep, as a DD/MM/YY string.
        end_date: Last day to keep, as a DD/MM/YY string.

    Returns:
        Whatever ``read_and_filter_txt`` returns for the extracted text
        file, or ``None`` when no text file could be obtained from the
        archive or a ``zipfile.BadZipFile`` error is raised.
    """
    try:
        with tempfile.TemporaryDirectory() as workdir:
            chat_path = extract_files_from_zip(zip_file, workdir)
            if not chat_path:
                return None
            # Must run while the temporary directory still exists.
            return read_and_filter_txt(chat_path, start_date, end_date)
    except zipfile.BadZipFile:
        return None
def extract_files_from_zip(zip_file, extract_to):
    """Extract an archive and locate the first ``.txt`` file inside it.

    Args:
        zip_file: Path to the zip archive to unpack.
        extract_to: Directory that receives the archive contents.

    Returns:
        The path of the first file ending in ``.txt`` found while walking
        ``extract_to``, or ``None`` if there is no such file or if any
        exception occurs during extraction or the search.
    """
    try:
        with zipfile.ZipFile(zip_file, 'r') as archive:
            archive.extractall(extract_to)

        for dirpath, _dirnames, filenames in os.walk(extract_to):
            for filename in filenames:
                if filename.endswith('.txt'):
                    return os.path.join(dirpath, filename)
        return None
    except Exception:
        # Best effort: any failure is reported to the caller as "no file".
        return None
def read_and_filter_txt(txt_file_path, start_date, end_date):
    """Return the chat lines whose message date lies in the given range.

    A line starting with ``DD/MM/YY,`` opens a new message; every following
    line without such a prefix is treated as a continuation of that message
    and is kept or dropped together with it. The selected text is also
    printed to stdout, preceded by a ``>`` line.

    Args:
        txt_file_path: Path to the exported chat text file.
        start_date: First day to keep (inclusive), as a DD/MM/YY string.
        end_date: Last day to keep (inclusive), as a DD/MM/YY string.

    Returns:
        The concatenated matching lines (possibly an empty string), or
        ``None`` if the file could not be decoded with any of the
        candidate encodings.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not in DD/MM/YY
            format.
    """
    # Parse the bounds once, before the retry loop: re-parsing them on each
    # encoding attempt would hand strptime a datetime on the second attempt
    # and raise TypeError.
    range_start = datetime.strptime(start_date, "%d/%m/%y")
    range_end = datetime.strptime(end_date, "%d/%m/%y")
    date_prefix = re.compile(r"(\d{2}/\d{2}/\d{2}),")

    # utf-8-sig reads plain UTF-8 as well and strips a leading BOM, which
    # would otherwise stop the first line from matching the date prefix.
    for encoding in ("utf-8-sig", "utf-16", "latin-1"):
        kept_lines = []
        include_message = False
        try:
            with open(txt_file_path, "r", encoding=encoding) as file:
                for line in file:
                    match = date_prefix.match(line)
                    if match:
                        try:
                            message_date = datetime.strptime(
                                match.group(1), "%d/%m/%y"
                            )
                        except ValueError:
                            # Date-shaped but not a real date (e.g. 99/99/99):
                            # treat it as a continuation line, not a header.
                            message_date = None
                        if message_date is not None:
                            include_message = (
                                range_start <= message_date <= range_end
                            )
                    if include_message:
                        kept_lines.append(line)
        except UnicodeError:
            # UnicodeError (not just UnicodeDecodeError): the utf-16 decoder
            # raises the base class when the stream has no BOM.
            continue

        trimmed_text = "".join(kept_lines)
        print(">")
        print(trimmed_text)
        return trimmed_text
    return None
def main():
    """Read the command-line options and run the chat extraction.

    All three options (``--start_date``, ``--end_date``, ``--zip_path``)
    are required; the return value of the extraction is not used here.
    """
    cli = argparse.ArgumentParser(description='Extract')
    required_options = (
        ('--start_date', 'Start date in DD/MM/YY format'),
        ('--end_date', 'End date in DD/MM/YY format'),
        ('--zip_path', 'Path to the WhatsApp chat zip file'),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)

    options = cli.parse_args()
    extract_txt_from_zip(options.zip_path, options.start_date, options.end_date)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment