Last active
April 4, 2023 00:51
-
-
Save fulcrum6378/f287d776a5882eb6f9064e455ff0bdb4 to your computer and use it in GitHub Desktop.
Reorganiser for the exported followers_and_following page of Instagram! (change the timezone info to your own)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
from datetime import datetime | |
from typing import Optional | |
from bs4 import BeautifulSoup | |
from persiantools.digits import fa_to_en | |
from persiantools.jdatetime import JalaliDateTime | |
from pytz import timezone | |
# Without a target argument there is nothing to process: show the usage text
# and stop before any file is touched.
if len(sys.argv) < 2:
    print("""
HOW-TO:
$ python ig_follow_lists.py <FILENAME|all>
For processing "all" the files in a folder, execute this command before that:
$ cd /path/to/folder
""")
    # sys.exit() instead of quit(): quit() is a convenience installed by the
    # `site` module for interactive sessions and is missing under `python -S`.
    # The exit status is unchanged (0).
    sys.exit()
# org_zone is the zone attached to the timestamps parsed from the export;
# dst_zone is the zone they are converted to for the report.
# pytz is used on purpose for Iran: per the original author, it already
# reflects the recent changes to Iran's daylight-saving law.
org_zone, dst_zone = map(timezone, ("US/Pacific", "Asia/Tehran"))
# Month-name -> month-number lookup tables (1-based), one for English
# abbreviations and one for the Persian names of the Gregorian months.
# The names are listed in calendar order and numbered by position.
_EN_MONTH_NAMES = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)
_FA_MONTH_NAMES = (
    "ژانویه", "فوریه", "مارس", "آوریل", "مه", "ژوئن",
    "ژوئیه", "اوت", "سپتامبر", "اکتبر", "نوامبر", "دسامبر",
)
months = {name: number for number, name in enumerate(_EN_MONTH_NAMES, start=1)}
months_persian = {name: number for number, name in enumerate(_FA_MONTH_NAMES, start=1)}
# Build the work list: for "all", every entry of the current directory whose
# name ends in .html (in os.listdir() order); otherwise just the single path
# given on the command line.
if sys.argv[1] == 'all':
    files = [name for name in os.listdir() if name.endswith('.html')]
else:
    files = [sys.argv[1]]
# Whether the export's date strings are in Persian. Decided once, from the
# first entry whose date string could be read, and then reused for every
# later entry and file.
is_persian: Optional[bool] = None

# Bidirectional control characters deleted from Persian date strings.
# NOTE(review): as visible here the original call was `replace("", "")`, which
# does nothing; presumably its first argument was an invisible bidi mark that
# was lost when the file was copied. Deleting the whole family restores that
# intent -- confirm against a real Persian export.
_BIDI_CONTROLS = dict.fromkeys(map(
    ord,
    "\u061c\u200e\u200f\u202a\u202b\u202c\u202d\u202e\u2066\u2067\u2068\u2069",
))


def _extract_entry(div, source, index):
    """Return ``(username, raw_datetime)`` read from one ``uiBoxWhite`` div.

    ``source`` and ``index`` are only used to build the error message.
    Raises ValueError when the div has neither 2 nor 3 children.
    """
    child_count = len(div.contents)
    if child_count == 2:  # the rest of the pages
        content = div.contents[0].contents[0].contents
        username = content[0].contents[0].string
    elif child_count == 3:  # the 'blocked_accounts' page
        content = div.contents[1].contents[0].contents
        username = div.contents[0].string
    else:
        raise ValueError("UNKNOWN DIV TYPE: " + source + " : " + str(index)
                         + " contents=" + str(child_count))
    return username, content[1].string


def _parse_timestamp(raw_datetime, persian):
    """Parse a ``<day> <month> <year><sep><hour>:<minute>`` string.

    The result is an aware datetime: the parsed wall-clock time is taken to
    be in ``org_zone`` and then converted to ``dst_zone``.
    """
    if persian:
        raw_datetime = fa_to_en(raw_datetime.translate(_BIDI_CONTROLS))
    raw_date, raw_time = raw_datetime.split("، " if persian else ", ")
    date_parts = raw_date.split(" ")
    time_parts = raw_time.split(":")
    month_table = months_persian if persian else months
    naive = datetime(int(date_parts[2]),
                     month_table[date_parts[1]],
                     int(date_parts[0]),
                     int(time_parts[0]),
                     int(time_parts[1]))
    # pytz zones must be attached with localize(); passing one as `tzinfo=`
    # applies the zone's LMT offset and shifts the result by several minutes.
    return org_zone.localize(naive).astimezone(dst_zone)


def _format_line(username, dt):
    """Render one report line: padded username, Gregorian date, Jalali date, time."""
    jdt = JalaliDateTime(dt)
    # The Jalali year is written with 5000 added, exactly as the original did.
    # NOTE(review): the reason for the offset is not evident from this file.
    return (
        f"{username.ljust(30)}"
        f"{dt.year:04d}.{dt.month:02d}.{dt.day:02d} - "
        f"{jdt.year + 5000:04d}.{jdt.month:02d}.{jdt.day:02d}"
        f" - {dt.hour:02d}:{dt.minute:02d} {dt.tzname()}\n"
    )


for f in files:
    with open(f, 'r', encoding='utf-8') as src:
        html: BeautifulSoup = BeautifulSoup(src.read(), 'html.parser')
    divs = html.body.find_all('div', attrs={'class': 'uiBoxWhite'})

    lines = []
    errored = 0
    for i_div, div in enumerate(divs):
        try:
            username, raw_datetime = _extract_entry(div, f, i_div)
            if is_persian is None:
                is_persian = any(mp in raw_datetime for mp in months_persian)
            # A line is appended only once it is complete, so a failed entry
            # cannot leave a dangling username in front of the next line.
            lines.append(_format_line(username, _parse_timestamp(raw_datetime, is_persian)))
        except Exception as exc:  # best effort: skip the entry, keep going
            errored += 1
            print(f"{f}: entry {i_div} skipped ({exc!r})", file=sys.stderr)
    if errored > 0:
        print(errored, "invalid objects!")

    # splitext() also copes with names that have no extension, where
    # rindex(".") would raise or cut at a dot inside a directory name.
    with open(os.path.splitext(f)[0] + ".txt", 'w', encoding='utf-8') as dst:
        dst.write("".join(lines))
    print(f, "is done.")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment