Skip to content

Instantly share code, notes, and snippets.

@mbforbes
Created June 18, 2021 18:30
Show Gist options
  • Save mbforbes/9a05be725587644443e5fe3a1f0b26b8 to your computer and use it in GitHub Desktop.
Save mbforbes/9a05be725587644443e5fe3a1f0b26b8 to your computer and use it in GitHub Desktop.
scruples preprocess
"""Compare preprocessed scruples to original titles.
Usage:
python -m sc.scripts.from_scruples
"""
import code # code.interact(local=dict(globals(), **locals()))
import json
import os
from typing import List, Tuple, Set, Dict, Any, Optional, NamedTuple, Iterator, Callable
from mbforbes_python_utils import read
import pandas as pd
from tqdm import tqdm
from sc.scrape.person_recognizer import tokenize
def main() -> None:
    """Export the scruples corpus to a TSV of cleaned text vs. original titles.

    For each of the train/dev/test splits, reads
    ``data/scruples-corpus/<split>.scruples-corpus.jsonl`` (one JSON object per
    line) and keeps every record that has a ``post_id``, a ``title`` and an
    ``action`` object containing a ``description``. Kept records are written
    to ``data/sentences/scruples-processed.v0.3.tsv`` (tab-separated, no index
    column); all other records are counted and reported as skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    pattern = "data/scruples-corpus/{}.scruples-corpus.jsonl"
    splits = ["train", "dev", "test"]
    res: List[Dict[str, Any]] = []
    total, skipped = 0, 0

    for split in splits:
        # Split on "\n" only (not str.splitlines()): JSON strings may contain
        # other Unicode line separators unescaped. Blank lines, e.g. from a
        # trailing newline, are dropped so json.loads never receives "".
        lines = [
            line
            for line in read(pattern.format(split)).split("\n")
            if line.strip()
        ]
        total += len(lines)

        for line in tqdm(lines):
            data = json.loads(line)
            action = data.get("action")
            # A record is only usable when every field read below is present;
            # "title" is checked too so a missing title is counted as a skip
            # instead of raising KeyError halfway through the export.
            if (
                "post_id" not in data
                or "title" not in data
                or not isinstance(action, dict)
                or "description" not in action
            ):
                skipped += 1
                continue

            description = action["description"]
            res.append(
                {
                    "subreddit": "amitheasshole",
                    "reddit-id": data["post_id"],
                    "cleaned": description,
                    "tokenized": " ".join(tokenize(description)),
                    "orig": data["title"],
                    "meta": json.dumps({"scruples-split": split}),
                }
            )

    # Guard the percentage so empty inputs report 0.00% rather than dividing
    # by zero.
    skipped_pct = 100 * skipped / total if total else 0.0
    print(f"Skipped {skipped}/{total} ({skipped_pct:.2f}%)")

    os.makedirs("data/sentences/", exist_ok=True)
    pd.DataFrame(res).to_csv(
        "data/sentences/scruples-processed.v0.3.tsv", sep="\t", index=False
    )
# Run the export only when executed as a script (e.g. via `python -m`), not
# when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment