Combining varying datasets into a single, consistent schema for fake news detection. A nice example of inheritance and other object-oriented Python features.
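At a glance, the pieces below fit together like this. This is a rough sketch: the valerie.datasets module path and the split arguments are assumptions, only valerie.data appears explicitly in the code.

from valerie.datasets import (
    Phase2Dataset,
    LiarDataset,
    ValerieDataset,
    combine_datasets_claims,
)

# Load two datasets with very different raw formats into the shared Claim schema,
# merge them with head-of-list priority, then hold out a stratified test split
# drawn only from the phase2 claims.
datasets = [Phase2Dataset.from_raw(), LiarDataset.from_raw()]
combined = ValerieDataset(combine_datasets_claims(datasets))
combined.train_test_split_subdataset("Phase2Dataset", train_size=0.9, random_state=42)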
"""Data classes and loading functions.""" | |
import os | |
import glob | |
import json | |
import shutil | |
import logging | |
import collections | |
import multiprocessing | |
import bs4 | |
import tldextract | |
from tqdm.auto import tqdm | |
from .preprocessing import clean_text | |
_logger = logging.getLogger(__name__) | |
class Claim: | |
"""A claim.""" | |
def __init__( | |
self, | |
id, | |
claim, | |
claimant=None, | |
label=None, | |
date=None, | |
related_articles=None, | |
explanation=None, | |
support=None, | |
dataset_name=None, | |
): | |
"""Constructor for Claim.""" | |
self.id = id | |
self.claim = clean_text(claim)[:4000] if claim else None # restrict num chars | |
self.claimant = clean_text(claimant) if claimant else None | |
self.label = label | |
self.date = str(date) if date else None | |
self.related_articles = related_articles | |
self.explanation = explanation | |
self.support = support | |
self.dataset_name = dataset_name | |
if dataset_name: | |
self.index = dataset_name + "/" + str(self.id) | |
else: | |
self.index = self.id | |
def to_dict(self): | |
return self.__dict__ | |
@classmethod | |
def from_dict(cls, d): | |
d = dict(d) | |
d.pop("index", None)  # "index" is derived in __init__, not a constructor argument | |
return cls(**d) | |
def logstr(self, fields_to_keep=()): | |
fields_to_keep = list(fields_to_keep) + ["id", "claim", "claimant", "date"] | |
_out_dict = {k: v for k, v in self.__dict__.items() if k in fields_to_keep} | |
return json.dumps(_out_dict, indent=2) | |
def __repr__(self): | |
return json.dumps(self.__dict__, indent=2) | |
def __eq__(self, other): | |
return self.claim == other.claim | |
def __hash__(self): | |
return hash(self.claim) | |
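# A small sketch of how Claim behaves: equality and hashing are based on the
# (cleaned) claim text, so duplicates collapse in sets, and from_dict rebuilds a
# Claim from a plain metadata dict (the ids and claim text below are made up).
def _example_claim_usage():
    a = Claim(1, "The earth is flat.", claimant="someone", label=0, dataset_name="phase2")
    b = Claim.from_dict({"id": 2, "claim": "The earth is flat.", "label": 0})
    assert a == b and len({a, b}) == 1  # same claim text, treated as the same claim
    print(a.index)  # "phase2/1", the dataset-qualified id
    print(a.logstr(["label"]))  # compact json view of selected fields for logging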
class Article: | |
"""An article.""" | |
def __init__( | |
self, | |
id, | |
content=None, | |
title=None, | |
source=None, | |
url=None, | |
date=None, | |
dataset_name=None, | |
): | |
"""Constructor for Article.""" | |
self.id = id | |
self.title = title[:4000] if title else None # restrict num chars | |
self.content = ( | |
content[:16000] if content else None | |
) # restrict num chars in content | |
self.source = source if source else (tldextract.extract(url).domain if url else None) | |
self.url = url | |
self.date = date | |
self.dataset_name = dataset_name | |
if dataset_name: | |
self.index = dataset_name + "/" + str(self.id) | |
else: | |
self.index = self.id | |
def to_dict(self): | |
return self.__dict__ | |
@classmethod | |
def from_dict(cls, d): | |
d = dict(d) | |
d.pop("index", None)  # "index" is derived in __init__, not a constructor argument | |
return cls(**d) | |
@classmethod | |
def from_txt(cls, id, text, **kwargs): | |
"""Construct an Article given text.""" | |
title = text.partition("\n")[0] | |
text = clean_text(text) | |
return cls(id, content=text, title=title, **kwargs) | |
@classmethod | |
def from_html(cls, id, html, **kwargs): | |
"""Constructs an Article given an html text.""" | |
def tag_visible(element): | |
whitelist = ["h1", "h2", "h3", "h4", "h5", "body", "p", "font"] | |
if element.parent.name not in whitelist: | |
return False | |
if isinstance(element, bs4.Comment): | |
return False | |
return True | |
soup = bs4.BeautifulSoup(html, "html.parser") | |
# if not valid html, might be already preprocessed text | |
if not bool(soup.find()): | |
text = clean_text(html) | |
return cls(id, content=text, **kwargs) | |
texts = soup.findAll(text=True) | |
texts = filter(tag_visible, texts) | |
text = "" | |
for t in texts: | |
t = clean_text(t) | |
if t and len(t) > 32: # disallow empty/short text sequences | |
text += t + " " | |
if "title" not in kwargs: | |
title = soup.title if soup.title and soup.title.string else None | |
title = clean_text(title.string) if title else None | |
return cls(id, content=text, title=title, **kwargs) | |
def logstr(self, fields_to_keep=()): | |
fields_to_keep = list(fields_to_keep) + ["id", "title", "content", "source"] | |
_out_dict = {k: v for k, v in self.__dict__.items() if k in fields_to_keep} | |
if _out_dict["content"] and len(_out_dict["content"]) > 400: | |
_out_dict["content"] = ( | |
_out_dict["content"][:200] + " ... " + _out_dict["content"][-200:] | |
) | |
return json.dumps(_out_dict, indent=2) | |
def __repr__(self): | |
return json.dumps(self.__dict__, indent=2) | |
def __eq__(self, other): | |
return self.index == other.index | |
def __hash__(self): | |
return hash(self.index) | |
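# A small illustration (with made-up HTML and url) of how from_html keeps only
# text under whitelisted tags and derives the source domain from the url.
def _example_article_from_html():
    html = (
        "<html><head><title>Example title</title></head><body>"
        "<p>" + "This paragraph is long enough to survive the length filter. " * 2 + "</p>"
        "<script>var skipped = 1;</script>"
        "</body></html>"
    )
    article = Article.from_html("12345.html", html, url="https://news.example.com/story")
    print(article.source)  # "example", extracted from the url with tldextract
    print(article.logstr())  # the script text is dropped, the <p> text is kept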
def combine_claims(claims_lists, logging_names=None): | |
"""The head of the list has most priority during union, the tail has the least.""" | |
if logging_names and len(claims_lists) != len(logging_names): | |
raise ValueError("len claims_list must be equal to len logging_names") | |
_logger.info("... combining claims ...") | |
combined_claims_set = set() | |
for i, claims in enumerate(claims_lists): | |
prev_len = len(combined_claims_set) | |
combined_claims_set = combined_claims_set | set(claims) | |
_logger.info( | |
"%s: %d --> %d (+ %d = %d - %d)", | |
logging_names[i] if logging_names else str(i), | |
prev_len, | |
len(combined_claims_set), | |
len(combined_claims_set) - prev_len, | |
len(claims), | |
prev_len + len(claims) - len(combined_claims_set), | |
) | |
return list(combined_claims_set) | |
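# Sketch of the head-priority union: when the same claim text shows up in two
# lists, the instance from the earlier (higher-priority) list is the one kept
# (the claims below are made up for illustration).
def _example_combine_claims():
    first = [Claim(1, "Water is wet.", label=2, dataset_name="first")]
    second = [
        Claim(9, "Water is wet.", label=1, dataset_name="second"),
        Claim(10, "The moon is cheese.", label=0, dataset_name="second"),
    ]
    combined = combine_claims([first, second], logging_names=["first", "second"])
    assert len(combined) == 2  # the duplicate claim text is collapsed
    kept = next(c for c in combined if c.claim == "Water is wet.")
    assert kept.dataset_name == "first"  # the head of the list wins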
def _save_relevant_articles_phase1(metadata, articles_dir, output_dir): | |
# find set of relevant articles in trimmed down dataset | |
relevant_articles = set() | |
for d in metadata: | |
relevant_articles.update(d["related_articles"]) | |
# copy relevant articles to output directory | |
for fpath in tqdm(glob.glob(os.path.join(articles_dir, "*.txt"))): | |
article_id = os.path.basename(fpath).split(".")[0] | |
if int(article_id) in relevant_articles: | |
shutil.copyfile( | |
fpath, os.path.join(output_dir, "articles", os.path.basename(fpath)) | |
) | |
return len(relevant_articles) | |
def _save_relevant_articles_phase2(metadata, articles_dir, output_dir): | |
# find set of relevant articles in trimmed down dataset | |
relevant_articles = set() | |
for d in metadata: | |
relevant_articles.update( | |
int(os.path.basename(n).split(".")[0]) for n in d["related_articles"] | |
) | |
# copy relevant articles to output directory | |
for fpath in tqdm(glob.glob(os.path.join(articles_dir, "*.html"))): | |
article_id = os.path.basename(fpath).split(".")[0] | |
if int(article_id) in relevant_articles: | |
shutil.copyfile( | |
fpath, os.path.join(output_dir, "articles", os.path.basename(fpath)) | |
) | |
return len(relevant_articles) | |
def trim_metadata_phase1(metadata_file, articles_dir, output_dir, n_examples): | |
os.makedirs(os.path.join(output_dir, "articles")) | |
# load data | |
with open(metadata_file, "r") as fi: | |
raw_data = json.load(fi) | |
# trim down dataset | |
_logger.info("orig len: %d", len(raw_data)) | |
metadata = raw_data[:n_examples] | |
_logger.info("new len: %d", len(metadata)) | |
num_articles = _save_relevant_articles_phase1(metadata, articles_dir, output_dir) | |
_logger.info("len relevant articles set: %d", num_articles) | |
# save trimmed down metadata.json to output directory | |
with open(os.path.join(output_dir, "metadata.json"), "w") as fo: | |
json.dump(metadata, fo, indent=2) | |
def trim_metadata_phase2(metadata_file, articles_dir, output_dir, n_examples=None): | |
os.makedirs(os.path.join(output_dir, "articles")) | |
# load data | |
with open(metadata_file, "r") as fi: | |
raw_data = json.load(fi) | |
# trim down dataset | |
_logger.info("orig len: %d", len(raw_data)) | |
metadata = raw_data[:n_examples] | |
_logger.info("new len: %d", len(metadata)) | |
num_articles = _save_relevant_articles_phase2(metadata, articles_dir, output_dir) | |
_logger.info("len relevant articles set: %d", num_articles) | |
# save trimmed down metadata.json to output directory | |
with open(os.path.join(output_dir, "metadata.json"), "w") as fo: | |
json.dump(metadata, fo, indent=2) | |
def train_test_split_phase2( | |
metadata_file, articles_dir, train_dir, test_dir, train_size, random_state | |
): | |
from sklearn.model_selection import train_test_split | |
if os.path.exists(train_dir): | |
raise ValueError("train_dir ({}) already exists".format(train_dir)) | |
if os.path.exists(test_dir): | |
raise ValueError("test_dir ({}) already exists".format(test_dir)) | |
os.makedirs(os.path.join(train_dir, "articles")) | |
os.makedirs(os.path.join(test_dir, "articles")) | |
with open(metadata_file, "r") as fi: | |
metadata = json.load(fi) | |
# log args | |
_logger.info("metadata_file: %s", metadata_file) | |
_logger.info("articles_dir: %s", articles_dir) | |
_logger.info("train_dir: %s", train_dir) | |
_logger.info("test_dir: %s", test_dir) | |
_logger.info("train_size: %.2f", train_size) | |
_logger.info("random_state: %d", random_state) | |
_logger.info("") | |
# train_test_split | |
all_labels = [d["label"] for d in metadata] | |
training_data, testing_data, _, _ = train_test_split( | |
metadata, | |
all_labels, | |
stratify=all_labels, | |
train_size=train_size, | |
random_state=random_state, | |
) | |
# logging train test split | |
_logger.info("Num Total Claims: %d", len(metadata)) | |
_logger.info("Num Train Claims: %d", len(training_data)) | |
_logger.info("Num Test Claims: %d", len(testing_data)) | |
_logger.info("") | |
all_labels_count = collections.Counter(all_labels) | |
train_labels_count = collections.Counter([d["label"] for d in training_data]) | |
test_labels_count = collections.Counter([d["label"] for d in testing_data]) | |
_logger.info("All Labels Count: %s", str(dict(all_labels_count))) | |
_logger.info("Train Labels Count: %s", str(dict(train_labels_count))) | |
_logger.info("Test Labels Count: %s", str(dict(test_labels_count))) | |
_logger.info("") | |
# articles | |
num_total_articles = len(glob.glob(os.path.join(articles_dir, "*.html"))) | |
_logger.info("Num Total Articles: %d", num_total_articles) | |
num_train_articles = _save_relevant_articles_phase2( | |
training_data, articles_dir, train_dir | |
) | |
_logger.info("Num Train Articles: %d", num_train_articles) | |
num_test_articles = _save_relevant_articles_phase2( | |
testing_data, articles_dir, test_dir | |
) | |
_logger.info("Num Test Articles: %d", num_test_articles) | |
# write metadata | |
with open(os.path.join(train_dir, "metadata.json"), "w") as fo: | |
json.dump(training_data, fo, indent=2) | |
with open(os.path.join(test_dir, "metadata.json"), "w") as fo: | |
json.dump(testing_data, fo, indent=2) |
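# A hedged sketch of carving a held-out split out of the phase2 data with the
# helpers above (the metadata path matches the raw layout used elsewhere in this
# repo; the articles and output directories are assumed locations).
def _example_split_phase2_raw():
    train_test_split_phase2(
        metadata_file="data/phase2-3/raw/metadata.json",
        articles_dir="data/phase2-3/raw/articles",
        train_dir="data/phase2-3/processed/train",
        test_dir="data/phase2-3/processed/test",
        train_size=0.95,
        random_state=42,
    )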
import os | |
import glob | |
import json | |
import logging | |
import datetime | |
import multiprocessing | |
import tldextract | |
import pandas as pd | |
from tqdm.auto import tqdm | |
from sklearn.model_selection import train_test_split | |
from valerie.data import Claim, Article, combine_claims | |
_logger = logging.getLogger(__name__) | |
#################### | |
####### Base ####### | |
#################### | |
class ValerieDataset: | |
def __init__(self, claims, articles=None, setify=True): | |
if setify: | |
self.claims = list(set(claims)) | |
_logger.info( | |
"%s claims set change %d --> %d", | |
self.__class__.__name__, | |
len(claims), | |
len(self.claims), | |
) | |
else: | |
self.claims = claims | |
_logger.info("len of claims: %d", len(self.claims)) | |
if articles: | |
self.articles = list(set(articles)) | |
_logger.info( | |
"%s articles set change %d --> %d", | |
self.__class__.__name__, | |
len(articles), | |
len(self.articles), | |
) | |
def train_test_split(self, **kwargs): | |
_logger.info("... performing train_test_split ...",) | |
_labels = [claim.label for claim in self.claims] | |
train_claims, test_claims, _, _ = train_test_split( | |
self.claims, _labels, stratify=_labels, **kwargs | |
) | |
self.train_claims = train_claims | |
self.test_claims = test_claims | |
_logger.info("len of all claims: %d", len(self.claims)) | |
_logger.info("len of train claims: %d", len(self.train_claims)) | |
_logger.info("len of test claims: %d", len(self.test_claims)) | |
def train_test_split_subdataset(self, subdataset_name, **kwargs): | |
"""Train test split for a subdataset of a combined dataset. | |
Performs a train test split on the specified subdataset that's within the | |
current combined dataset. This is useful if you want all your test data | |
to only come from a single dataset, rather than all the combined ones. | |
""" | |
_logger.info( | |
"... performing train_test_split_subdataset on subdataset %s ...", | |
subdataset_name, | |
) | |
sub_claims = [ | |
claim for claim in self.claims if claim.dataset_name == subdataset_name | |
] | |
not_sub_claims = [ | |
claim for claim in self.claims if claim.dataset_name != subdataset_name | |
] | |
sub_labels = [claim.label for claim in sub_claims] | |
train_claims, test_claims, _, _ = train_test_split( | |
sub_claims, sub_labels, stratify=sub_labels, **kwargs | |
) | |
self.train_claims = train_claims + not_sub_claims | |
self.test_claims = test_claims | |
_logger.info("len of all claims: %d", len(self.claims)) | |
_logger.info("len of train claims: %d", len(self.train_claims)) | |
_logger.info("len of test claims: %d", len(self.test_claims)) | |
@classmethod | |
def df_to_claims(cls, df, row_to_claim): | |
claims = [] | |
misses = 0 | |
for i, row in tqdm( | |
df.iterrows(), total=len(df), desc="{} to claims".format(cls.__name__) | |
): | |
# if phase1/phase2, do not try/except an error when parsing df | |
if cls.__name__ in [ | |
Phase1Dataset.__name__, | |
Phase2Dataset.__name__, | |
Phase2TrialDataset.__name__, | |
]: | |
claims.append(row_to_claim(i, row)) | |
else: | |
try: | |
claims.append(row_to_claim(i, row)) | |
except Exception: | |
misses += 1 | |
continue | |
_logger.info("missed row to claim conversions: %d", misses) | |
return claims | |
@classmethod | |
def from_raw(cls): | |
raise NotImplementedError | |
@classmethod | |
def from_all_filter(cls, claims, articles=None): | |
_claims = [c for c in claims if c.dataset_name == cls.__name__] | |
if articles: | |
_articles = [a for a in articles if a.dataset_name == cls.__name__] | |
return cls(_claims, _articles) | |
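# Every concrete dataset below follows the same inheritance pattern: from_raw
# reads the raw file(s) into a DataFrame and row_to_claim maps each row onto the
# shared Claim schema. A minimal sketch of a hypothetical new dataset (the csv
# path and column names are made up for illustration):
class MyCsvDataset(ValerieDataset):
    @classmethod
    def from_raw(cls, data_csv="data/external/my-dataset/claims.csv"):
        df = pd.read_csv(data_csv)
        return cls(cls.df_to_claims(df, cls.row_to_claim))

    @classmethod
    def row_to_claim(cls, i, row):
        return Claim(
            str(i),
            claim=row["headline"],
            claimant=row["author"],
            label=row["label"],  # assumes the csv already uses the 0/1/2 label scheme
            dataset_name=cls.__name__,
        )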
#################### | |
##### Internal ##### | |
#################### | |
class Phase1Dataset(ValerieDataset): | |
@classmethod | |
def from_raw( | |
cls, metadata_file="data/phase1/raw/metadata.json", articles_dir=None, nproc=1 | |
): | |
# validate paths up front since pandas raises unhelpful errors for missing files | |
if not os.path.isfile(metadata_file): | |
raise ValueError( | |
"metadata file {} was not found or is not file".format(metadata_file) | |
) | |
if articles_dir and not os.path.isdir(articles_dir): | |
raise ValueError( | |
"articles dir {} was not found or is not dir".format(articles_dir) | |
) | |
df = pd.read_json(metadata_file) | |
claims = cls.df_to_claims(df, cls.row_to_claim) | |
articles = None | |
if articles_dir: | |
articles = cls.articles_from_phase1(articles_dir, nproc) | |
return cls(claims, articles) | |
@classmethod | |
def row_to_claim(cls, i, row): | |
row = dict(row) | |
_id = row.pop("id") | |
# only parse related_articles if the field exists | |
# (it is removed from the metadata during the eval phase) | |
related_articles = {} | |
if "related_articles" in row: | |
for rel_art in row.pop("related_articles"): | |
rel_art = cls.__name__ + "/" + str(rel_art) + ".txt" | |
related_articles[rel_art] = rel_art | |
return Claim( | |
_id, related_articles=related_articles, dataset_name=cls.__name__, **row | |
) | |
@staticmethod | |
def articles_from_phase1(articles_dir, nproc=1): | |
fpaths = glob.glob(os.path.join(articles_dir, "*.txt")) | |
pool = multiprocessing.Pool(nproc) | |
articles = [] | |
for article in tqdm( | |
pool.imap_unordered(_articles_from_phase1_visit, fpaths), | |
total=len(fpaths), | |
desc="loading articles from phase1", | |
): | |
articles.append(article) | |
return articles | |
def _articles_from_phase1_visit(fpath): | |
with open(fpath, encoding="utf8") as fi: | |
art_id = os.path.basename(fpath) | |
article = Article.from_txt( | |
art_id, fi.read(), dataset_name=Phase1Dataset.__name__ | |
) | |
return article | |
class Phase2Dataset(ValerieDataset): | |
@classmethod | |
def from_raw( | |
cls, | |
metadata_file="data/phase2-3/raw/metadata.json", | |
articles_dir=None, | |
nproc=1, | |
setify=True, | |
): | |
# validate paths up front since pandas raises unhelpful errors for missing files | |
if not os.path.isfile(metadata_file): | |
raise ValueError( | |
"metadata file {} was not found or is not file".format(metadata_file) | |
) | |
if articles_dir and not os.path.isdir(articles_dir): | |
raise ValueError( | |
"articles dir {} was not found or is not dir".format(articles_dir) | |
) | |
df = pd.read_json(metadata_file) | |
claims = cls.df_to_claims(df, cls.row_to_claim) | |
articles = None | |
if articles_dir: | |
articles = cls.articles_from_phase2(articles_dir, claims, nproc=nproc) | |
return cls(claims, articles, setify=setify) | |
@classmethod | |
def row_to_claim(cls, i, row): | |
row = dict(row) | |
_id = row.pop("id") | |
# only parse related_articles if the field exists | |
# (it is removed from the metadata during the eval phase) | |
related_articles = {} | |
if "related_articles" in row: | |
for k, v in row.pop("related_articles").items(): | |
rel_art = cls.__name__ + "/" + os.path.basename(k) | |
related_articles[rel_art] = v | |
return Claim( | |
_id, related_articles=related_articles, dataset_name=cls.__name__, **row | |
) | |
@staticmethod | |
def articles_from_phase2(articles_dir, claims, nproc=1): | |
fpaths = glob.glob(os.path.join(articles_dir, "*.html")) | |
pool = multiprocessing.Pool(nproc) | |
articles = [] | |
for article in tqdm( | |
pool.imap_unordered(_articles_from_phase2_visit, fpaths), | |
total=len(fpaths), | |
desc="loading article from phase2", | |
): | |
articles.append(article) | |
# fetch the urls for each article from the claims and perform tldextract | |
misses = 0 | |
art_index_to_url = { | |
k: v for claim in claims for k, v in claim.related_articles.items() | |
} | |
for article in articles: | |
try: | |
article.url = art_index_to_url[article.index] | |
except KeyError: | |
misses += 1 | |
continue | |
article.source = tldextract.extract(article.url).domain | |
_logger.info("missed art index to url conversions: %d", misses) | |
return articles | |
def _articles_from_phase2_visit(fpath): | |
with open(fpath, encoding="utf8") as fi: | |
art_id = os.path.basename(fpath) | |
article = Article.from_html( | |
art_id, fi.read(), dataset_name=Phase2Dataset.__name__ | |
) | |
return article | |
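# Loading the phase2 data end to end: claims from the metadata, articles parsed
# from the raw html in parallel (the articles_dir below is an assumed location).
def _example_load_phase2():
    dataset = Phase2Dataset.from_raw(
        metadata_file="data/phase2-3/raw/metadata.json",
        articles_dir="data/phase2-3/raw/articles",
        nproc=4,
    )
    dataset.train_test_split(train_size=0.9, random_state=42)
    print(len(dataset.train_claims), len(dataset.test_claims), len(dataset.articles))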
class Phase2DisjointDataset(Phase2Dataset): | |
@classmethod | |
def from_raw(cls, unlabelled_metadata_file, labelled_metadata_file): | |
with open(unlabelled_metadata_file) as fi: | |
trial_metadata_unlabelled = json.load(fi) | |
with open(labelled_metadata_file) as fi: | |
trial_labels = json.load(fi) | |
trial_metadata = [ | |
{ | |
**claim, | |
"label": trial_labels[str(claim["id"])]["label"], | |
"related_articles": trial_labels[str(claim["id"])]["related_articles"], | |
} | |
for claim in trial_metadata_unlabelled | |
] | |
df = pd.DataFrame(trial_metadata) | |
claims = cls.df_to_claims(df, cls.row_to_claim) | |
return cls(claims) | |
class Phase2TrialDataset(Phase2DisjointDataset): | |
@classmethod | |
def from_raw( | |
cls, | |
unlabelled_metadata_file="data/phase2-trial/raw/2_trial_metadata.json", | |
labelled_metadata_file="data/phase2-trial/raw/2_trial_labels.json", | |
): | |
return super().from_raw( | |
unlabelled_metadata_file=unlabelled_metadata_file, | |
labelled_metadata_file=labelled_metadata_file, | |
) | |
class Phase2Validation100Dataset(Phase2DisjointDataset): | |
@classmethod | |
def from_raw( | |
cls, | |
unlabelled_metadata_file="data/phase2-validation-100/raw/metadata.json", | |
labelled_metadata_file="data/phase2-validation-100/raw/labels.json", | |
): | |
return super().from_raw( | |
unlabelled_metadata_file=unlabelled_metadata_file, | |
labelled_metadata_file=labelled_metadata_file, | |
) | |
class Phase2Validation500Dataset(Phase2DisjointDataset): | |
@classmethod | |
def from_raw( | |
cls, | |
unlabelled_metadata_file="data/phase2-validation-500/raw/val_metadata_p2.json", | |
labelled_metadata_file="data/phase2-validation-500/raw/2_labels.json", | |
): | |
return super().from_raw( | |
unlabelled_metadata_file=unlabelled_metadata_file, | |
labelled_metadata_file=labelled_metadata_file, | |
) | |
#################### | |
##### External ##### | |
#################### | |
class FakeNewsTop50Dataset(ValerieDataset): | |
"""https://github.com/BuzzFeedNews/2018-12-fake-news-top-50.git""" | |
@classmethod | |
def from_raw( | |
cls, | |
top_csv="data/external/2018-12-fake-news-top-50/data/top_2018.csv", | |
sites_csvs=[ | |
"data/external/2018-12-fake-news-top-50/data/sites_2016.csv", | |
"data/external/2018-12-fake-news-top-50/data/sites_2017.csv", | |
"data/external/2018-12-fake-news-top-50/data/sites_2018.csv", | |
], | |
): | |
df = pd.read_csv(top_csv) | |
sites = [] | |
for sites_csv in sites_csvs: | |
with open(sites_csv) as fi: | |
sites += fi.read().splitlines() | |
sites = list(set(sites)) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
dataset.sites = sites | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
# TODO: consider lowercasing the input claim (all words | |
# start with capital currently) | |
return Claim( | |
str(i), | |
claim=row["title"], | |
date=row["published_date"], | |
claimant="Facebook user", | |
label=0, | |
dataset_name=cls.__name__, | |
) | |
class FakeNewsKaggleDataset(ValerieDataset): | |
"""https://www.kaggle.com/c/fake-news/""" | |
@classmethod | |
def from_raw(cls, train_csv="data/external/fake-news/train.csv"): | |
df = pd.read_csv(train_csv) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
# in the kaggle data, label 0 = reliable and 1 = unreliable; | |
# map unreliable to 0 (false) and reliable to 2 (true) | |
return Claim( | |
str(i), | |
claim=row["title"], | |
claimant=row["author"], | |
label=0 if row["label"] else 2, | |
dataset_name=cls.__name__, | |
) | |
class FakeNewsNetDataset(ValerieDataset): | |
"""https://github.com/KaiDMML/FakeNewsNet.git""" | |
@classmethod | |
def from_raw( | |
cls, | |
politifact_fake_csv="data/external/FakeNewsNet/dataset/politifact_fake.csv", | |
politifact_real_csv="data/external/FakeNewsNet/dataset/politifact_real.csv", | |
gossipcop_fake_csv="data/external/FakeNewsNet/dataset/gossipcop_fake.csv", | |
gossipcop_real_csv="data/external/FakeNewsNet/dataset/gossipcop_real.csv", | |
name="fake_news_net", | |
): | |
df = pd.concat( | |
[ | |
pd.read_csv(politifact_fake_csv).assign(label=0), | |
pd.read_csv(politifact_real_csv).assign(label=2), | |
pd.read_csv(gossipcop_fake_csv).assign(label=0), | |
pd.read_csv(gossipcop_real_csv).assign(label=2), | |
], | |
ignore_index=True, | |
) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
return Claim( | |
str(i), | |
claim=row["title"], | |
claimant=tldextract.extract(row["news_url"]).domain, | |
label=row["label"], | |
dataset_name=cls.__name__, | |
) | |
class GeorgeMcIntireDataset(ValerieDataset): | |
"""https://github.com/GeorgeMcIntire""" | |
@classmethod | |
def from_raw(cls, data_csv="data/external/george-mcintire/fake_or_real_news.csv"): | |
df = pd.read_csv(data_csv, skiprows=1, names=["id", "title", "text", "label"]) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
return Claim( | |
str(i), | |
claim=row["title"], | |
label=0 if row["label"] == "FAKE" else 1, | |
dataset_name=cls.__name__, | |
) | |
class ISOTDataset(ValerieDataset): | |
"""https://www.uvic.ca/engineering/ece/isot/datasets/""" | |
@classmethod | |
def from_raw( | |
cls, | |
fake_csv="data/external/ISOT/Fake.csv", | |
true_csv="data/external/ISOT/True.csv", | |
): | |
df = pd.concat( | |
[ | |
pd.read_csv(fake_csv).assign(label=0), | |
pd.read_csv(true_csv).assign(label=2), | |
], | |
ignore_index=True, | |
) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
# the raw dates come in a few formats, e.g. "December 31, 2017", "19-Feb-18", "Dec 31, 2017" | |
_date = None | |
for fmt in ("%B %d, %Y", "%d-%b-%y", "%b %d, %Y"): | |
    try: | |
        _date = datetime.datetime.strptime(row["date"], fmt) | |
        break | |
    except (ValueError, TypeError): | |
        continue | |
return Claim( | |
str(i), | |
claim=row["title"], | |
date=_date.strftime("%Y-%m-%d") if _date else None, | |
label=row["label"], | |
dataset_name=cls.__name__, | |
) | |
class LiarDataset(ValerieDataset): | |
"""https://www.cs.ucsb.edu/~william/data/liar_dataset.zip""" | |
@classmethod | |
def from_raw(cls, data_tsv="data/external/liar/train.tsv"): | |
df = pd.read_csv( | |
data_tsv, | |
sep="\t", | |
names=[ | |
"id", | |
"label", | |
"statement", | |
"subject(s)", | |
"speaker", | |
"speaker's job title", | |
"state info", | |
"party affiliation", | |
"total credit history count", | |
"barely true counts", | |
"false counts", | |
"half true counts", | |
"mostly true counts", | |
"context (venue/location of speech or statement)", | |
"pants on fire counts", | |
], | |
) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
if row["label"] == "false": | |
_lab = 0 | |
elif row["label"] == "true": | |
_lab = 2 | |
else: | |
_lab = 1 | |
return Claim( | |
str(i), | |
claim=row["statement"], | |
claimant=row["speaker"] if isinstance(row["speaker"], str) else None, | |
label=_lab, | |
dataset_name=cls.__name__, | |
) | |
class MrisdalDataset(ValerieDataset): | |
"""https://www.kaggle.com/mrisdal/fake-news""" | |
@classmethod | |
def from_raw(cls, data_csv="data/external/mrisdal/fake.csv"): | |
df = pd.read_csv(data_csv) | |
dataset = cls(cls.df_to_claims(df, cls.row_to_claim)) | |
dataset.df = df | |
return dataset | |
@classmethod | |
def row_to_claim(cls, i, row): | |
if row["ord_in_thread"] != 0: | |
raise ValueError("must be main post") | |
return Claim( | |
str(i), | |
claim=row["title"], | |
claimant=row["site_url"], | |
date=datetime.datetime.strptime( | |
row["published"].split("T")[0], "%Y-%m-%d" | |
).strftime("%Y-%m-%d"), | |
label=0, | |
dataset_name=cls.__name__, | |
) | |
#################### | |
##### Combined ##### | |
#################### | |
class LeadersDataset(ValerieDataset): | |
@classmethod | |
def from_raw(cls): | |
datasets = [ | |
Phase2Dataset.from_raw(), | |
Phase1Dataset.from_raw(), | |
] | |
assert isinstance(datasets[0], Phase2Dataset) | |
return cls(combine_datasets_claims(datasets)) | |
class Phase2CombinedDataset(ValerieDataset): | |
@classmethod | |
def from_raw( | |
cls, datasets=[], | |
): | |
datasets = [Phase2Dataset.from_raw()] + [ | |
dataset.from_raw() for dataset in datasets | |
] | |
assert isinstance(datasets[0], Phase2Dataset) | |
return cls(combine_datasets_claims(datasets)) | |
class CombinedDataset(ValerieDataset): | |
@classmethod | |
def from_raw( | |
cls, | |
datasets=[ | |
Phase2Dataset, | |
Phase1Dataset, | |
FakeNewsTop50Dataset, | |
FakeNewsKaggleDataset, | |
FakeNewsNetDataset, | |
GeorgeMcIntireDataset, | |
ISOTDataset, | |
LiarDataset, | |
MrisdalDataset, | |
], | |
): | |
datasets = [dataset.from_raw() for dataset in datasets] | |
return cls(combine_datasets_claims(datasets)) | |
def combine_datasets_claims(datasets): | |
claims_lists = [dataset.claims for dataset in datasets] | |
logging_names = [dataset.__class__.__name__ for dataset in datasets] | |
return combine_claims(claims_lists, logging_names=logging_names) | |
name_to_dataset = { | |
Phase1Dataset.__name__: Phase1Dataset, | |
Phase2Dataset.__name__: Phase2Dataset, | |
Phase2DisjointDataset.__name__: Phase2DisjointDataset, | |
Phase2TrialDataset.__name__: Phase2TrialDataset, | |
Phase2Validation100Dataset.__name__: Phase2Validation100Dataset, | |
Phase2Validation500Dataset.__name__: Phase2Validation500Dataset, | |
FakeNewsTop50Dataset.__name__: FakeNewsTop50Dataset, | |
FakeNewsKaggleDataset.__name__: FakeNewsKaggleDataset, | |
FakeNewsNetDataset.__name__: FakeNewsNetDataset, | |
GeorgeMcIntireDataset.__name__: GeorgeMcIntireDataset, | |
ISOTDataset.__name__: ISOTDataset, | |
LiarDataset.__name__: LiarDataset, | |
MrisdalDataset.__name__: MrisdalDataset, | |
LeadersDataset.__name__: LeadersDataset, | |
Phase2CombinedDataset.__name__: Phase2CombinedDataset, | |
CombinedDataset.__name__: CombinedDataset, | |
} |
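# The registry above lets callers pick a dataset by name (e.g. from a CLI flag or
# a config file). A minimal sketch, assuming the default raw file locations exist:
def _example_build_dataset_by_name(name="LiarDataset"):
    dataset = name_to_dataset[name].from_raw()
    dataset.train_test_split(train_size=0.9, random_state=42)
    return dataset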
"""Preprocessing.""" | |
import re | |
import logging | |
import unicodedata | |
import wordninja | |
_logger = logging.getLogger(__name__) | |
def extract_words_from_url(url): | |
"""Extracts words from a url. | |
Example | |
------- | |
input: https://www.berkeleyschools.net/departments/public-information-office/ | |
output: berkeley schools departments public information office | |
""" | |
remove = { | |
"www", | |
"html", | |
"index", | |
"htm", | |
"http:", | |
"https:", | |
"http", | |
"https", | |
"com", | |
"ca", | |
"gov", | |
"org", | |
"net", | |
"co", | |
} | |
words = [ | |
clean_text(w, remove_punctuation=True) | |
for w in split(url, [".", "-", "/", "?", "=", "&"]) | |
] | |
words = [w for word in words for w in wordninja.split(word)] | |
words = [ | |
word | |
for word in words | |
if word and word not in remove and not word.isnumeric() and len(word) > 2 | |
] | |
return words | |
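# Runnable form of the docstring example above; the exact splits depend on
# wordninja's word-frequency model, so the result is printed rather than asserted.
def _example_extract_words_from_url():
    url = "https://www.berkeleyschools.net/departments/public-information-office/"
    print(extract_words_from_url(url))
    # expected (per the docstring): ['berkeley', 'schools', 'departments', 'public', 'information', 'office']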
def split(string, delimiters): | |
"""Split a string using multiple delimiters.""" | |
regexPattern = "|".join(map(re.escape, delimiters)) | |
return re.split(regexPattern, string) | |
def clean_text(text, remove_punctuation=False): | |
"""Cleans the text of whitespace and control chars.""" | |
output = [] | |
for char in text: | |
cp = ord(char) | |
if cp == 0 or cp == 0xFFFD or _is_control(char): | |
continue | |
if remove_punctuation and _is_punctuation(char): | |
continue | |
if _is_whitespace(char): | |
if len(output) > 0 and output[-1] == " ": | |
continue | |
output.append(" ") | |
else: | |
output.append(char) | |
return "".join(output).strip() | |
def _is_punctuation(char): | |
cp = ord(char) | |
if ( | |
(cp >= 33 and cp <= 47) | |
or (cp >= 58 and cp <= 64) | |
or (cp >= 91 and cp <= 96) | |
or (cp >= 123 and cp <= 126) | |
): | |
return True | |
cat = unicodedata.category(char) | |
if cat.startswith("P"): | |
return True | |
return False | |
def _is_whitespace(char): | |
if char == " " or char == "\t" or char == "\n" or char == "\r": | |
return True | |
cat = unicodedata.category(char) | |
if cat == "Zs": | |
return True | |
return False | |
def _is_control(char): | |
if char == "\t" or char == "\n" or char == "\r": | |
return False | |
cat = unicodedata.category(char) | |
if cat in ("Cc", "Cf"): | |
return True | |
return False |