@jaymody
Last active November 3, 2020 19:00
Combining varying datasets into a single consistent schema for Fake News Detection. A great example of inheritance and other neat OOP features in Python.
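As a rough, hypothetical usage sketch (assuming the dataset classes below are importable as valerie.datasets, which the gist does not show, and with illustrative split arguments): load the raw datasets into the shared Claim/Article schema, combine them, and hold out a test set drawn only from the phase2 data.

from valerie.datasets import Phase2CombinedDataset, LiarDataset

# build the phase2 dataset plus one external dataset, deduplicated into one claim list
dataset = Phase2CombinedDataset.from_raw(datasets=[LiarDataset])
# keep the test split restricted to claims that originated from the phase2 data
dataset.train_test_split_subdataset("Phase2Dataset", train_size=0.9, random_state=42)
train_claims, test_claims = dataset.train_claims, dataset.test_claims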
"""Data classes and loading functions."""
import os
import glob
import json
import shutil
import logging
import collections
import multiprocessing
import bs4
import tldextract
from tqdm.auto import tqdm
from .preprocessing import clean_text
_logger = logging.getLogger(__name__)
class Claim:
"""A claim."""
def __init__(
self,
id,
claim,
claimant=None,
label=None,
date=None,
related_articles=None,
explanation=None,
support=None,
dataset_name=None,
):
"""Constructor for Claim."""
self.id = id
self.claim = clean_text(claim)[:4000] if claim else None # restrict num chars
self.claimant = clean_text(claimant) if claimant else None
self.label = label
        self.date = str(date) if date else None
self.related_articles = related_articles
self.explanation = explanation
self.support = support
self.dataset_name = dataset_name
if dataset_name:
self.index = dataset_name + "/" + str(self.id)
else:
self.index = self.id
def to_dict(self):
return self.__dict__
@classmethod
def from_dict(cls, d):
return cls(**d)
    def logstr(self, fields_to_keep=None):
        # note: a mutable default list argument would be shared (and mutated) across calls
        fields_to_keep = ["id", "claim", "claimant", "date"] + (fields_to_keep or [])
_out_dict = {k: v for k, v in self.__dict__.items() if k in fields_to_keep}
return json.dumps(_out_dict, indent=2)
def __repr__(self):
return json.dumps(self.__dict__, indent=2)
def __eq__(self, other):
return self.claim == other.claim
def __hash__(self):
return hash(self.claim)
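# Illustrative sketch (not part of the original gist): claims compare and hash on
# their cleaned claim text, so a set deduplicates claims that share the same text
# even when their ids, labels, or source datasets differ.
def _example_claim_dedup():
    a = Claim("1", "The sky is green.", label=0, dataset_name="dataset_a")
    b = Claim("2", "The sky is green.", label=2, dataset_name="dataset_b")
    assert a == b and len({a, b}) == 1
    # note: to_dict() includes the derived "index" field, so from_dict is meant for
    # raw metadata-style dicts rather than for round-tripping to_dict() output
    return a.logstr(["label"])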
class Article:
"""An article."""
def __init__(
self,
id,
content=None,
title=None,
source=None,
url=None,
date=None,
dataset_name=None,
):
"""Constructor for Article."""
self.id = id
self.title = title[:4000] if title else None # restrict num chars
self.content = (
content[:16000] if content else None
        )  # restrict num chars in content
self.source = tldextract.extract(url).domain if url else None
self.url = url
self.date = date
self.dataset_name = dataset_name
if dataset_name:
self.index = dataset_name + "/" + str(self.id)
else:
self.index = self.id
def to_dict(self):
return self.__dict__
@classmethod
def from_dict(cls, d):
return cls(**d)
@classmethod
def from_txt(cls, id, text, **kwargs):
"""Construct an Article given text."""
title = text.partition("\n")[0]
text = clean_text(text)
return cls(id, content=text, title=title, **kwargs)
@classmethod
def from_html(cls, id, html, **kwargs):
"""Constructs an Article given an html text."""
def tag_visible(element):
whitelist = ["h1", "h2", "h3", "h4", "h5", "body", "p", "font"]
if element.parent.name not in whitelist:
return False
if isinstance(element, bs4.Comment):
return False
return True
soup = bs4.BeautifulSoup(html, "html.parser")
# if not valid html, might be already preprocessed text
if not bool(soup.find()):
text = clean_text(html)
return cls(id, content=text, **kwargs)
texts = soup.findAll(text=True)
texts = filter(tag_visible, texts)
text = ""
for t in texts:
t = clean_text(t)
            if t and len(t) > 32:  # disallow empty/short text sequences
text += t + " "
if "title" not in kwargs:
title = soup.title if soup.title and soup.title.string else None
title = clean_text(title.string) if title else None
return cls(id, content=text, title=title, **kwargs)
    def logstr(self, fields_to_keep=None):
        # note: a mutable default list argument would be shared (and mutated) across calls
        fields_to_keep = ["id", "title", "content", "source"] + (fields_to_keep or [])
_out_dict = {k: v for k, v in self.__dict__.items() if k in fields_to_keep}
if _out_dict["content"] and len(_out_dict["content"]) > 400:
_out_dict["content"] = (
_out_dict["content"][:200] + " ... " + _out_dict["content"][-200:]
)
return json.dumps(_out_dict, indent=2)
def __repr__(self):
return json.dumps(self.__dict__, indent=2)
def __eq__(self, other):
return self.index == other.index
def __hash__(self):
return hash(self.index)
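# Illustrative sketch (not part of the original gist): parse a small html document
# into an Article; visible <p>/<h*> text longer than 32 chars is kept as content,
# the <title> tag becomes the title, and the source domain is derived from the url
# (the url below is a hypothetical placeholder).
def _example_article_from_html():
    html = (
        "<html><head><title>Example headline</title></head><body>"
        "<p>This paragraph is long enough to survive the length filter used above.</p>"
        "<p>Too short.</p>"
        "</body></html>"
    )
    article = Article.from_html("42.html", html, url="https://www.example.com/story")
    assert article.title == "Example headline"
    assert "length filter" in article.content
    assert article.source == "example"
    return article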
def combine_claims(claims_lists, logging_names=None):
    """Union a list of claim lists into a single deduplicated list of claims.

    The head of the list has the most priority during the union, the tail the least.
    """
if logging_names and len(claims_lists) != len(logging_names):
raise ValueError("len claims_list must be equal to len logging_names")
_logger.info("... combining claims ...")
combined_claims_set = set()
for i, claims in enumerate(claims_lists):
prev_len = len(combined_claims_set)
combined_claims_set = combined_claims_set | set(claims)
_logger.info(
"%s: %d --> %d (+ %d = %d - %d)",
logging_names[i] if logging_names else str(i),
prev_len,
len(combined_claims_set),
len(combined_claims_set) - prev_len,
len(claims),
prev_len + len(claims) - len(combined_claims_set),
)
return list(combined_claims_set)
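# Illustrative sketch (not part of the original gist): when the same claim text
# appears in multiple lists, the copy from the earlier (higher priority) list is
# the one kept by the union.
def _example_combine_priority():
    primary = [Claim("1", "shared claim", label=2, dataset_name="primary")]
    secondary = [
        Claim("9", "shared claim", label=0, dataset_name="secondary"),
        Claim("10", "secondary only claim", label=1, dataset_name="secondary"),
    ]
    combined = combine_claims([primary, secondary], logging_names=["primary", "secondary"])
    assert len(combined) == 2
    assert next(c for c in combined if c.claim == "shared claim").label == 2
    return combined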
def _save_relevant_articles_phase1(metadata, articles_dir, output_dir):
# find set of relevant articles in trimmed down dataset
relevant_articles = set()
for d in metadata:
relevant_articles.update(d["related_articles"])
# copy relevant articles to output directory
for fpath in tqdm(glob.glob(os.path.join(articles_dir, "*.txt"))):
article_id = os.path.basename(fpath).split(".")[0]
if int(article_id) in relevant_articles:
shutil.copyfile(
fpath, os.path.join(output_dir, "articles", os.path.basename(fpath))
)
return len(relevant_articles)
def _save_relevant_articles_phase2(metadata, articles_dir, output_dir):
# find set of relevant articles in trimmed down dataset
relevant_articles = set()
for d in metadata:
relevant_articles.update(
int(os.path.basename(n).split(".")[0]) for n in d["related_articles"]
)
# copy relevant articles to output directory
for fpath in tqdm(glob.glob(os.path.join(articles_dir, "*.html"))):
article_id = os.path.basename(fpath).split(".")[0]
if int(article_id) in relevant_articles:
shutil.copyfile(
fpath, os.path.join(output_dir, "articles", os.path.basename(fpath))
)
return len(relevant_articles)
def trim_metadata_phase1(metadata_file, articles_dir, output_dir, n_examples):
os.makedirs(os.path.join(output_dir, "articles"))
# load data
with open(metadata_file, "r") as fi:
raw_data = json.load(fi)
# trim down dataset
_logger.info("orig len: %d", len(raw_data))
metadata = raw_data[:n_examples]
_logger.info("new len: %d", len(metadata))
num_articles = _save_relevant_articles_phase1(metadata, articles_dir, output_dir)
_logger.info("len relevant articles set: %d", num_articles)
# save trimmed down metadata.json to output directory
with open(os.path.join(output_dir, "metadata.json"), "w") as fo:
json.dump(metadata, fo, indent=2)
def trim_metadata_phase2(metadata_file, articles_dir, output_dir, n_examples=None):
os.makedirs(os.path.join(output_dir, "articles"))
# load data
with open(metadata_file, "r") as fi:
raw_data = json.load(fi)
# trim down dataset
_logger.info("orig len: %d", len(raw_data))
metadata = raw_data[:n_examples]
_logger.info("new len: %d", len(metadata))
num_articles = _save_relevant_articles_phase2(metadata, articles_dir, output_dir)
_logger.info("len relevant articles set: %d", num_articles)
# save trimmed down metadata.json to output directory
with open(os.path.join(output_dir, "metadata.json"), "w") as fo:
json.dump(metadata, fo, indent=2)
def train_test_split_phase2(
metadata_file, articles_dir, train_dir, test_dir, train_size, random_state
):
from sklearn.model_selection import train_test_split
if os.path.exists(train_dir):
raise ValueError("train_dir ({}) already exists".format(train_dir))
if os.path.exists(test_dir):
raise ValueError("test_dir ({}) already exists".format(test_dir))
os.makedirs(os.path.join(train_dir, "articles"))
os.makedirs(os.path.join(test_dir, "articles"))
with open(metadata_file, "r") as fi:
metadata = json.load(fi)
# log args
_logger.info("metadata_file: %s", metadata_file)
_logger.info("articles_dir: %s", articles_dir)
_logger.info("train_dir: %s", train_dir)
_logger.info("test_dir: %s", test_dir)
_logger.info("train_size: %.2f", train_size)
_logger.info("random_state: %d", random_state)
_logger.info("")
# train_test_split
all_labels = [d["label"] for d in metadata]
training_data, testing_data, _, _ = train_test_split(
metadata,
all_labels,
stratify=all_labels,
train_size=train_size,
random_state=random_state,
)
# logging train test split
_logger.info("Num Total Claims: %d", len(metadata))
_logger.info("Num Train Claims: %d", len(training_data))
_logger.info("Num Test Claims: %d", len(testing_data))
_logger.info("")
all_labels_count = collections.Counter(all_labels)
train_labels_count = collections.Counter([d["label"] for d in training_data])
test_labels_count = collections.Counter([d["label"] for d in testing_data])
_logger.info("All Labels Count: %s", str(dict(all_labels_count)))
_logger.info("Train Labels Count: %s", str(dict(train_labels_count)))
_logger.info("Test Labels Count: %s", str(dict(test_labels_count)))
_logger.info("")
# articles
num_total_articles = len(glob.glob(os.path.join(articles_dir, "*.html")))
_logger.info("Num Total Articles: %d", num_total_articles)
num_train_articles = _save_relevant_articles_phase2(
training_data, articles_dir, train_dir
)
_logger.info("Num Train Articles: %d", num_train_articles)
num_test_articles = _save_relevant_articles_phase2(
testing_data, articles_dir, test_dir
)
_logger.info("Num Test Articles: %d", num_test_articles)
# write metadata
with open(os.path.join(train_dir, "metadata.json"), "w") as fo:
json.dump(training_data, fo, indent=2)
with open(os.path.join(test_dir, "metadata.json"), "w") as fo:
json.dump(testing_data, fo, indent=2)
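# Hypothetical usage sketch for the phase2 helpers above (not part of the original
# gist): the metadata path matches the default used elsewhere in this gist, while
# the articles and output directories are illustrative placeholders.
def _example_trim_phase2():
    trim_metadata_phase2(
        metadata_file="data/phase2-3/raw/metadata.json",
        articles_dir="data/phase2-3/raw/articles",
        output_dir="data/phase2-3/trimmed-100",
        n_examples=100,
    )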
import os
import glob
import json
import logging
import datetime
import multiprocessing
import tldextract
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from valerie.data import Claim, Article, combine_claims
_logger = logging.getLogger(__name__)
####################
####### Base #######
####################
class ValerieDataset:
def __init__(self, claims, articles=None, setify=True):
if setify:
self.claims = list(set(claims))
_logger.info(
"%s claims set change %d --> %d",
self.__class__.__name__,
len(claims),
len(self.claims),
)
else:
self.claims = claims
_logger.info("len of claims: %d", len(self.claims))
if articles:
self.articles = list(set(articles))
_logger.info(
"%s articles set change %d --> %d",
self.__class__.__name__,
len(articles),
len(self.articles),
)
def train_test_split(self, **kwargs):
_logger.info("... performing train_test_split ...",)
_labels = [claim.label for claim in self.claims]
train_claims, test_claims, _, _ = train_test_split(
self.claims, _labels, stratify=_labels, **kwargs
)
self.train_claims = train_claims
self.test_claims = test_claims
_logger.info("len of all claims: %d", len(self.claims))
_logger.info("len of train claims: %d", len(self.train_claims))
_logger.info("len of test claims: %d", len(self.test_claims))
def train_test_split_subdataset(self, subdataset_name, **kwargs):
"""Train test split for a subdataset of a combined dataset.
Performs a train test split on the specified subdataset that's within the
current combined dataset. This is useful if you want all your test data
to only come from a single dataset, rather than all the combined ones.
"""
_logger.info(
"... performing train_test_split_subdataset on subdataset %s ...",
subdataset_name,
)
sub_claims = [
claim for claim in self.claims if claim.dataset_name == subdataset_name
]
not_sub_claims = [
claim for claim in self.claims if claim.dataset_name != subdataset_name
]
sub_labels = [claim.label for claim in sub_claims]
train_claims, test_claims, _, _ = train_test_split(
sub_claims, sub_labels, stratify=sub_labels, **kwargs
)
self.train_claims = train_claims + not_sub_claims
self.test_claims = test_claims
_logger.info("len of all claims: %d", len(self.claims))
_logger.info("len of train claims: %d", len(self.train_claims))
_logger.info("len of test claims: %d", len(self.test_claims))
@classmethod
def df_to_claims(cls, df, row_to_claim):
claims = []
misses = 0
for i, row in tqdm(
df.iterrows(), total=len(df), desc="{} to claims".format(cls.__name__)
):
            # if phase1/phase2/phase2-trial, do not try/except errors when parsing the df
            if cls.__name__ in [
                Phase1Dataset.__name__,
                Phase2Dataset.__name__,
                Phase2TrialDataset.__name__,
            ]:
claims.append(row_to_claim(i, row))
else:
try:
claims.append(row_to_claim(i, row))
                except Exception:
misses += 1
continue
_logger.info("missed row to claim conversions: %d", misses)
return claims
@classmethod
def from_raw(cls):
raise NotImplementedError
@classmethod
    def from_all_filter(cls, claims, articles=None):
        _claims = [c for c in claims if c.dataset_name == cls.__name__]
        _articles = None
        if articles:
            _articles = [a for a in articles if a.dataset_name == cls.__name__]
        return cls(_claims, _articles)
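# Illustrative sketch (not part of the original gist): a tiny in-memory dataset
# showing train_test_split_subdataset, where only claims from the "sub" dataset
# are eligible for the test split and everything else stays in train.
def _example_subdataset_split():
    claims = [
        Claim(str(i), "sub claim number %d" % i, label=i % 2, dataset_name="sub")
        for i in range(8)
    ] + [
        Claim(str(i), "other claim number %d" % i, label=i % 2, dataset_name="other")
        for i in range(4)
    ]
    dataset = ValerieDataset(claims)
    dataset.train_test_split_subdataset("sub", train_size=0.5, random_state=42)
    assert all(c.dataset_name == "sub" for c in dataset.test_claims)
    return dataset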
####################
##### Internal #####
####################
class Phase1Dataset(ValerieDataset):
@classmethod
def from_raw(
cls, metadata_file="data/phase1/raw/metadata.json", articles_dir=None, nproc=1
):
        # validate paths up front, since the error pandas raises for a missing file is unhelpful
if not os.path.isfile(metadata_file):
raise ValueError(
"metadata file {} was not found or is not file".format(metadata_file)
)
if articles_dir and not os.path.isdir(articles_dir):
raise ValueError(
"articles dir {} was not found or is not dir".format(articles_dir)
)
df = pd.read_json(metadata_file)
claims = cls.df_to_claims(df, cls.row_to_claim)
articles = None
if articles_dir:
articles = cls.articles_from_phase1(articles_dir, nproc)
return cls(claims, articles)
@classmethod
def row_to_claim(cls, i, row):
row = dict(row)
_id = row.pop("id")
# only parse related articles if it exists
# (we do this check since related_articles is a removed field for the eval)
related_articles = {}
if "related_articles" in row:
for rel_art in row.pop("related_articles"):
rel_art = cls.__name__ + "/" + str(rel_art) + ".txt"
related_articles[rel_art] = rel_art
return Claim(
_id, related_articles=related_articles, dataset_name=cls.__name__, **row
)
@staticmethod
    def articles_from_phase1(articles_dir, nproc=1):
        fpaths = glob.glob(os.path.join(articles_dir, "*.txt"))
        articles = []
        with multiprocessing.Pool(nproc) as pool:
            for article in tqdm(
                pool.imap_unordered(_articles_from_phase1_visit, fpaths),
                total=len(fpaths),
                desc="loading articles from phase1",
            ):
                articles.append(article)
        return articles
def _articles_from_phase1_visit(fpath):
with open(fpath, encoding="utf8") as fi:
art_id = os.path.basename(fpath)
article = Article.from_txt(
art_id, fi.read(), dataset_name=Phase1Dataset.__name__
)
return article
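# Hypothetical usage sketch (not part of the original gist): the metadata path is
# the default declared above, while the articles directory and process count are
# illustrative placeholders.
def _example_load_phase1():
    return Phase1Dataset.from_raw(
        metadata_file="data/phase1/raw/metadata.json",
        articles_dir="data/phase1/raw/articles",
        nproc=4,
    )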
class Phase2Dataset(ValerieDataset):
@classmethod
def from_raw(
cls,
metadata_file="data/phase2-3/raw/metadata.json",
articles_dir=None,
nproc=1,
setify=True,
):
        # validate paths up front, since the error pandas raises for a missing file is unhelpful
if not os.path.isfile(metadata_file):
raise ValueError(
"metadata file {} was not found or is not file".format(metadata_file)
)
if articles_dir and not os.path.isdir(articles_dir):
raise ValueError(
"articles dir {} was not found or is not dir".format(articles_dir)
)
df = pd.read_json(metadata_file)
claims = cls.df_to_claims(df, cls.row_to_claim)
articles = None
if articles_dir:
articles = cls.articles_from_phase2(articles_dir, claims, nproc=nproc)
return cls(claims, articles, setify=setify)
@classmethod
def row_to_claim(cls, i, row):
row = dict(row)
_id = row.pop("id")
# only parse related articles if it exists
# (we do this check since related_articles is a removed field for the eval)
related_articles = {}
if "related_articles" in row:
for k, v in row.pop("related_articles").items():
rel_art = cls.__name__ + "/" + os.path.basename(k)
related_articles[rel_art] = v
return Claim(
_id, related_articles=related_articles, dataset_name=cls.__name__, **row
)
@staticmethod
    def articles_from_phase2(articles_dir, claims, nproc=1):
        fpaths = glob.glob(os.path.join(articles_dir, "*.html"))
        articles = []
        with multiprocessing.Pool(nproc) as pool:
            for article in tqdm(
                pool.imap_unordered(_articles_from_phase2_visit, fpaths),
                total=len(fpaths),
                desc="loading articles from phase2",
            ):
                articles.append(article)
# fetch the urls for each article from the claims and perform tldextract
misses = 0
art_index_to_url = {
k: v for claim in claims for k, v in claim.related_articles.items()
}
for article in articles:
try:
article.url = art_index_to_url[article.index]
            except KeyError:
misses += 1
continue
article.source = tldextract.extract(article.url).domain
_logger.info("missed art index to url conversions: %d", misses)
return articles
def _articles_from_phase2_visit(fpath):
with open(fpath, encoding="utf8") as fi:
art_id = os.path.basename(fpath)
article = Article.from_html(
art_id, fi.read(), dataset_name=Phase2Dataset.__name__
)
return article
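# Hypothetical usage sketch (not part of the original gist): as above, the articles
# directory is an illustrative placeholder; when it is provided, each article's url
# and source are backfilled from the claims' related_articles mapping.
def _example_load_phase2():
    return Phase2Dataset.from_raw(
        metadata_file="data/phase2-3/raw/metadata.json",
        articles_dir="data/phase2-3/raw/articles",
        nproc=4,
    )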
class Phase2DisjointDataset(Phase2Dataset):
@classmethod
def from_raw(cls, unlabelled_metadata_file, labelled_metadata_file):
with open(unlabelled_metadata_file) as fi:
trial_metadata_unlabelled = json.load(fi)
with open(labelled_metadata_file) as fi:
trial_labels = json.load(fi)
trial_metadata = [
{
**claim,
"label": trial_labels[str(claim["id"])]["label"],
"related_articles": trial_labels[str(claim["id"])]["related_articles"],
}
for claim in trial_metadata_unlabelled
]
df = pd.DataFrame(trial_metadata)
claims = cls.df_to_claims(df, cls.row_to_claim)
return cls(claims)
class Phase2TrialDataset(Phase2DisjointDataset):
@classmethod
def from_raw(
cls,
unlabelled_metadata_file="data/phase2-trial/raw/2_trial_metadata.json",
labelled_metadata_file="data/phase2-trial/raw/2_trial_labels.json",
):
return super().from_raw(
unlabelled_metadata_file=unlabelled_metadata_file,
labelled_metadata_file=labelled_metadata_file,
)
class Phase2Validation100Dataset(Phase2DisjointDataset):
@classmethod
def from_raw(
cls,
unlabelled_metadata_file="data/phase2-validation-100/raw/metadata.json",
labelled_metadata_file="data/phase2-validation-100/raw/labels.json",
):
return super().from_raw(
unlabelled_metadata_file=unlabelled_metadata_file,
labelled_metadata_file=labelled_metadata_file,
)
class Phase2Validation500Dataset(Phase2DisjointDataset):
@classmethod
def from_raw(
cls,
unlabelled_metadata_file="data/phase2-validation-500/raw/val_metadata_p2.json",
labelled_metadata_file="data/phase2-validation-500/raw/2_labels.json",
):
return super().from_raw(
unlabelled_metadata_file=unlabelled_metadata_file,
labelled_metadata_file=labelled_metadata_file,
)
####################
##### External #####
####################
class FakeNewsTop50Dataset(ValerieDataset):
"""https://github.com/BuzzFeedNews/2018-12-fake-news-top-50.git"""
@classmethod
def from_raw(
cls,
top_csv="data/external/2018-12-fake-news-top-50/data/top_2018.csv",
sites_csvs=[
"data/external/2018-12-fake-news-top-50/data/sites_2016.csv",
"data/external/2018-12-fake-news-top-50/data/sites_2017.csv",
"data/external/2018-12-fake-news-top-50/data/sites_2018.csv",
],
):
df = pd.read_csv(top_csv)
sites = []
for sites_csv in sites_csvs:
with open(sites_csv) as fi:
sites += fi.read().splitlines()
sites = list(set(sites))
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
dataset.sites = sites
return dataset
@classmethod
def row_to_claim(cls, i, row):
# TODO: consider lowercasing the input claim (all words
# start with capital currently)
return Claim(
str(i),
claim=row["title"],
date=row["published_date"],
claimant="Facebook user",
label=0,
dataset_name=cls.__name__,
)
class FakeNewsKaggleDataset(ValerieDataset):
"""https://www.kaggle.com/c/fake-news/"""
@classmethod
def from_raw(cls, train_csv="data/external/fake-news/train.csv"):
df = pd.read_csv(train_csv)
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
return dataset
@classmethod
def row_to_claim(cls, i, row):
        # the kaggle labels are 1 for unreliable and 0 for reliable; map them onto
        # this project's label scheme (0 = false, 2 = true)
return Claim(
str(i),
claim=row["title"],
claimant=row["author"],
label=0 if row["label"] else 2,
dataset_name=cls.__name__,
)
class FakeNewsNetDataset(ValerieDataset):
"""https://github.com/KaiDMML/FakeNewsNet.git"""
@classmethod
def from_raw(
cls,
politifact_fake_csv="data/external/FakeNewsNet/dataset/politifact_fake.csv",
politifact_real_csv="data/external/FakeNewsNet/dataset/politifact_real.csv",
gossipcop_fake_csv="data/external/FakeNewsNet/dataset/gossipcop_fake.csv",
gossipcop_real_csv="data/external/FakeNewsNet/dataset/gossipcop_real.csv",
name="fake_news_net",
):
df = pd.concat(
[
pd.read_csv(politifact_fake_csv).assign(label=0),
pd.read_csv(politifact_real_csv).assign(label=2),
pd.read_csv(gossipcop_fake_csv).assign(label=0),
pd.read_csv(gossipcop_real_csv).assign(label=2),
],
ignore_index=True,
)
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
return dataset
@classmethod
def row_to_claim(cls, i, row):
return Claim(
str(i),
claim=row["title"],
claimant=tldextract.extract(row["news_url"]).domain,
label=row["label"],
dataset_name=cls.__name__,
)
class GeorgeMcIntireDataset(ValerieDataset):
"""https://github.com/GeorgeMcIntire"""
@classmethod
def from_raw(cls, data_csv="data/external/george-mcintire/fake_or_real_news.csv"):
df = pd.read_csv(data_csv, skiprows=1, names=["id", "title", "text", "label"])
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
return dataset
@classmethod
def row_to_claim(cls, i, row):
return Claim(
str(i),
claim=row["title"],
label=0 if row["label"] == "FAKE" else 1,
dataset_name=cls.__name__,
)
class ISOTDataset(ValerieDataset):
"""https://www.uvic.ca/engineering/ece/isot/datasets/"""
@classmethod
def from_raw(
cls,
fake_csv="data/external/ISOT/Fake.csv",
true_csv="data/external/ISOT/True.csv",
):
df = pd.concat(
[
pd.read_csv(fake_csv).assign(label=0),
pd.read_csv(true_csv).assign(label=2),
],
ignore_index=True,
)
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
return dataset
@classmethod
def row_to_claim(cls, i, row):
        # the raw dates come in a few formats, e.g. "December 31, 2017",
        # "19-Feb-18", and "Dec 31, 2017"
        _date = None
        for fmt in ("%B %d, %Y", "%d-%b-%y", "%b %d, %Y"):
            try:
                _date = datetime.datetime.strptime(row["date"], fmt)
                break
            except (ValueError, TypeError):
                continue
return Claim(
str(i),
claim=row["title"],
date=_date.strftime("%Y-%m-%d") if _date else None,
label=row["label"],
dataset_name=cls.__name__,
)
class LiarDataset(ValerieDataset):
"""https://www.cs.ucsb.edu/~william/data/liar_dataset.zip"""
@classmethod
def from_raw(cls, data_tsv="data/external/liar/train.tsv"):
df = pd.read_csv(
data_tsv,
sep="\t",
names=[
"id",
"label",
"statement",
"subject(s)",
"speaker",
"speaker's job title",
"state info",
"party affiliation",
"total credit history count",
"barely true counts",
"false counts",
"half true counts",
"mostly true counts",
"context (venue/location of speech or statement)",
"pants on fire counts",
],
)
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
return dataset
@classmethod
def row_to_claim(cls, i, row):
if row["label"] == "false":
_lab = 0
elif row["label"] == "true":
_lab = 2
else:
_lab = 1
return Claim(
str(i),
claim=row["statement"],
claimant=row["speaker"] if isinstance(row["speaker"], str) else None,
label=_lab,
dataset_name=cls.__name__,
)
class MrisdalDataset(ValerieDataset):
"""https://www.kaggle.com/mrisdal/fake-news"""
@classmethod
def from_raw(cls, data_csv="data/external/mrisdal/fake.csv"):
df = pd.read_csv(data_csv)
dataset = cls(cls.df_to_claims(df, cls.row_to_claim))
dataset.df = df
return dataset
@classmethod
def row_to_claim(cls, i, row):
if row["ord_in_thread"] != 0:
raise ValueError("must be main post")
return Claim(
str(i),
claim=row["title"],
claimant=row["site_url"],
date=datetime.datetime.strptime(
row["published"].split("T")[0], "%Y-%m-%d"
).strftime("%Y-%m-%d"),
label=0,
dataset_name=cls.__name__,
)
####################
##### Combined #####
####################
class LeadersDataset(ValerieDataset):
@classmethod
def from_raw(cls):
datasets = [
Phase2Dataset.from_raw(),
Phase1Dataset.from_raw(),
]
assert isinstance(datasets[0], Phase2Dataset)
return cls(combine_datasets_claims(datasets))
class Phase2CombinedDataset(ValerieDataset):
@classmethod
def from_raw(
cls, datasets=[],
):
datasets = [Phase2Dataset.from_raw()] + [
dataset.from_raw() for dataset in datasets
]
assert isinstance(datasets[0], Phase2Dataset)
return cls(combine_datasets_claims(datasets))
class CombinedDataset(ValerieDataset):
@classmethod
def from_raw(
cls,
datasets=[
Phase2Dataset,
Phase1Dataset,
FakeNewsTop50Dataset,
FakeNewsKaggleDataset,
FakeNewsNetDataset,
GeorgeMcIntireDataset,
ISOTDataset,
LiarDataset,
MrisdalDataset,
],
):
datasets = [dataset.from_raw() for dataset in datasets]
return cls(combine_datasets_claims(datasets))
def combine_datasets_claims(datasets):
claims_lists = [dataset.claims for dataset in datasets]
logging_names = [dataset.__class__.__name__ for dataset in datasets]
return combine_claims(claims_lists, logging_names=logging_names)
name_to_dataset = {
Phase1Dataset.__name__: Phase1Dataset,
Phase2Dataset.__name__: Phase2Dataset,
Phase2DisjointDataset.__name__: Phase2DisjointDataset,
Phase2TrialDataset.__name__: Phase2TrialDataset,
Phase2Validation100Dataset.__name__: Phase2Validation100Dataset,
Phase2Validation500Dataset.__name__: Phase2Validation500Dataset,
FakeNewsTop50Dataset.__name__: FakeNewsTop50Dataset,
FakeNewsKaggleDataset.__name__: FakeNewsKaggleDataset,
FakeNewsNetDataset.__name__: FakeNewsNetDataset,
GeorgeMcIntireDataset.__name__: GeorgeMcIntireDataset,
ISOTDataset.__name__: ISOTDataset,
LiarDataset.__name__: LiarDataset,
MrisdalDataset.__name__: MrisdalDataset,
LeadersDataset.__name__: LeadersDataset,
Phase2CombinedDataset.__name__: Phase2CombinedDataset,
CombinedDataset.__name__: CombinedDataset,
}
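# Hypothetical usage sketch (not part of the original gist): the registry above can
# be used to build datasets by name (e.g. from a config file) and merge their claims,
# with earlier datasets taking priority during deduplication; this assumes the
# corresponding raw data files exist on disk.
def _example_build_from_registry(dataset_names=("Phase2Dataset", "LiarDataset")):
    datasets = [name_to_dataset[name].from_raw() for name in dataset_names]
    return ValerieDataset(combine_datasets_claims(datasets))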
"""Preprocessing."""
import re
import logging
import unicodedata
import wordninja
_logger = logging.getLogger(__name__)
def extract_words_from_url(url):
"""Extracts words from a url.
Example
-------
input: https://www.berkeleyschools.net/departments/public-information-office/
output: berkeley schools departments public information office
"""
remove = {
"www",
"html",
"index",
"htm",
"http:",
"https:",
"http",
"https",
"com",
"ca",
"gov",
"org",
"net",
"co",
}
words = [
clean_text(w, remove_punctuation=True)
for w in split(url, [".", "-", "/", "?", "=", "&"])
]
words = [w for word in words for w in wordninja.split(word)]
words = [
word
for word in words
if word and word not in remove and not word.isnumeric() and len(word) > 2
]
return words
def split(string, delimiters):
"""Split a string using multiple delimiters."""
    regex_pattern = "|".join(map(re.escape, delimiters))
    return re.split(regex_pattern, string)
def clean_text(text, remove_punctuation=False):
"""Cleans the text of whitespace and control chars."""
output = []
for char in text:
cp = ord(char)
if cp == 0 or cp == 0xFFFD or _is_control(char):
continue
if remove_punctuation and _is_punctuation(char):
continue
if _is_whitespace(char):
if len(output) > 0 and output[-1] == " ":
continue
output.append(" ")
else:
output.append(char)
return "".join(output).strip()
def _is_punctuation(char):
cp = ord(char)
if (
(cp >= 33 and cp <= 47)
or (cp >= 58 and cp <= 64)
or (cp >= 91 and cp <= 96)
or (cp >= 123 and cp <= 126)
):
return True
cat = unicodedata.category(char)
if cat.startswith("P"):
return True
return False
def _is_whitespace(char):
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
def _is_control(char):
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat in ("Cc", "Cf"):
return True
return False