import json
import random
import re
from urllib.parse import urlparse

from tqdm import tqdm

urls_counts = {}
years_count = {}

# news outlets used as classification labels
classes = {
    'www.nytimes.com': 0,
    'www.washingtonpost.com': 1,
    'www.foxnews.com': 2,
}

# bucket publication years into two-year spans
combined_years = {
    '2009': '2009_2010',
    '2010': '2009_2010',
    '2011': '2011_2012',
    '2012': '2011_2012',
    '2013': '2013_2014',
    '2014': '2013_2014',
    '2015': '2015_2016',
    '2016': '2015_2016',
}
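
# For example (illustrative values, not drawn from the dataset): an article
# from https://www.foxnews.com/... dated "2014-03-07" gets class label 2 via
# classes[urlparse(url).netloc] and falls in the "2013_2014" bucket via
# combined_years["2014"].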
adaptation_per_year = {}


def process_file(file, split):
    count_total = 0
    count_filtered = 0
    summarization_rows_per_year = {}
    classification_rows_per_year_and_class = {}
    with open(file, 'r') as infile:
        for line in tqdm(infile.readlines()):
            jsonline = json.loads(line)
            url = jsonline['url']
            url_cleaned = urlparse(url).netloc
            text = jsonline['text']
            summary = jsonline['summary']
            date = jsonline['date']
            year = date[:4]
            count_total += 1
            # heuristic: drop articles whose URL does not contain the
            # publication year, since their dates are unreliable
            if year not in url:
                count_filtered += 1
                continue
            # keep only the 2009-2016 range covered by combined_years
            if int(year) < 2009 or int(year) > 2016:
                continue
            # find four-digit year mentions in the summary and the text
            years_mentioned = (re.findall(r'[ .,/]\d{4}[ .,/]', summary) +
                               re.findall(r'[ .,/]\d{4}[ .,/]', text))
            if len(years_mentioned) > 0:
                years_mentioned = [x.strip(' .,/') for x in years_mentioned]
                # if the article mentions any "future" year, its date is
                # very likely invalid
                future_years = [x for x in years_mentioned if int(x) > int(year)]
                if len(future_years) > 0:
                    continue
            year = combined_years[year]
            if year not in years_count:
                years_count[year] = 0
            years_count[year] += 1
            if url_cleaned not in urls_counts:
                urls_counts[url_cleaned] = 0
            urls_counts[url_cleaned] += 1
            if year not in summarization_rows_per_year:
                summarization_rows_per_year[year] = []
                classification_rows_per_year_and_class[f"{year}_0"] = []
                classification_rows_per_year_and_class[f"{year}_1"] = []
                classification_rows_per_year_and_class[f"{year}_2"] = []
                # adaptation_per_year[year] = ""
            summarization_rows_per_year[year].append(
                {"year": year, "score": 0, "text": text, "summary": summary, "label": 0}
            )
            # adaptation_per_year[year] += text
            if url_cleaned in classes:
                cls = classes[url_cleaned]
                classification_rows_per_year_and_class[f"{year}_{cls}"].append(
                    {"year": year, "score": 0, "text": text, "label": cls}
                )
    print(urls_counts)
    print(years_count)
    # summarization (disabled toggle; flip to True to regenerate these files)
    if False:
        summarization_min_count = min(
            len(summarization_rows_per_year[year]) for year in set(combined_years.values())
        )
        print(f" * min count: {summarization_min_count}")
        # iterate unique buckets; combined_years.values() repeats each one twice
        for year in sorted(set(combined_years.values())):
            name = f'{year}.{split}.forlabels.scores'
            if split == 'train':
                name = 'indivs/' + name
            path = f"/Users/danielk/ideaProjects/temporal_drift/newsroom/summarization/{split}/{name}"
            with open(path, 'w') as outfile:
                for row in summarization_rows_per_year[year][:summarization_min_count]:
                    outfile.write(json.dumps(row) + "\n")

    # unlabeled text for domain-adaptive pretraining
    if True:
        summarization_min_count = min(
            len(summarization_rows_per_year[year]) for year in set(combined_years.values())
        )
        print(f" * min count: {summarization_min_count}")
        for year in sorted(set(combined_years.values())):
            name = f'{year}.{split}.txt'
            if split == 'train':
                name = 'indivs/' + name
            path = f"/Users/danielk/ideaProjects/temporal_drift/newsroom/newsroom_dapt/{split}/{name}"
            with open(path, 'w') as outfile:
                for row in summarization_rows_per_year[year][:summarization_min_count]:
                    outfile.write(row['text'] + "\n")

    # source classification (disabled toggle)
    if False:
        # balance the classes: truncate every (year, class) bucket to the
        # size of the smallest bucket
        print(classification_rows_per_year_and_class.keys())
        for cls in range(3):
            print([len(classification_rows_per_year_and_class[f"{year}_{cls}"])
                   for year in sorted(set(combined_years.values()))])
        classification_min_count_across_years_and_cats = min(
            len(classification_rows_per_year_and_class[f"{year}_{cls}"])
            for year in set(combined_years.values()) for cls in range(3)
        )
        print(f" * min count: {classification_min_count_across_years_and_cats}")
        for year in sorted(set(combined_years.values())):
            name = f'{year}.{split}.forlabels.scores'
            if split == 'train':
                name = 'indivs/' + name
            path = f"/Users/danielk/ideaProjects/temporal_drift/newsroom/newsroom_source_classification/{split}/{name}"
            rows = []
            for cls in range(3):
                rows += classification_rows_per_year_and_class[f"{year}_{cls}"][:classification_min_count_across_years_and_cats]
            random.shuffle(rows)
            with open(path, 'w') as outfile:
                for row in rows:
                    outfile.write(json.dumps(row) + "\n")
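
# Note: this script assumes the output directories above (summarization/,
# newsroom_dapt/, newsroom_source_classification/, and their per-split
# indivs/ subfolders) already exist; if they may not, add `import os` and
# os.makedirs(os.path.dirname(path), exist_ok=True) before each open(path, 'w').
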
process_file('/Users/danielk/ideaProjects/temporal_drift/newsroom/release/dev.jsonl', 'dev')
process_file('/Users/danielk/ideaProjects/temporal_drift/newsroom/release/test.jsonl', 'test')
process_file('/Users/danielk/ideaProjects/temporal_drift/newsroom/release/train.jsonl', 'train')
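
# A minimal sketch of the input this script assumes: each release/*.jsonl line
# is a JSON object carrying at least the fields read above. The example below
# is fabricated for illustration (field names come from the code; the values
# are invented):
#
#   {"url": "https://www.nytimes.com/2013/05/01/world/example.html",
#    "date": "20130501", "text": "Full article body ...",
#    "summary": "A one-sentence summary ..."}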