Skip to content

Instantly share code, notes, and snippets.

@micimize
Created December 10, 2017 19:40
Show Gist options
  • Save micimize/170749a5e4729adf22cc710eb11ff5e8 to your computer and use it in GitHub Desktop.
Save micimize/170749a5e4729adf22cc710eb11ff5e8 to your computer and use it in GitHub Desktop.
Example of flattening a data corpus whose directory structure is semantically meaningful into a single directory of JSON files
import glob, json, os
def makedirs_p(dir_name):
    """Create ``dir_name`` and any missing parent directories (like ``mkdir -p``).

    Calling this on a directory that already exists is a no-op.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True replaces the exists()-then-makedirs() sequence, which could
    # fail if another process created the directory between the two calls.
    os.makedirs(dir_name, exist_ok=True)
def contents_of(file_name):
    """Read ``file_name`` as UTF-8 text and return its entire contents."""
    with open(file_name, mode='r', encoding='utf8') as source:
        text = source.read()
    return text
def corpus_annotations(corpus_name, path):
    """Derive identifiers and annotations from a document's location on disk.

    The last three components of ``path`` are read as
    ``<anything>-<partition>/<class>/<document id>``; for example
    ``20news-bydate-train/rec.sport.baseball/104530`` yields partition
    ``train``, class ``rec.sport.baseball`` and id ``104530``.

    Args:
        corpus_name: Name recorded for the corpus as a whole.
        path: Path to a single document file.

    Returns:
        A ``(identifiers, annotations)`` pair, where ``identifiers`` is the
        list ``[corpus_name, corpus_class, partition, corpus_id]`` and
        ``annotations`` is a dict with the keys ``"name"``, ``"partition"``
        and ``"class"``.

    Raises:
        IndexError: If ``path`` has fewer than three components.
    """
    # normpath collapses duplicate and trailing separators and converts to the
    # platform's native separator, so the components can be split on os.sep
    # without producing empty segments.
    parts = os.path.normpath(path).split(os.sep)
    corpus_id = parts[-1]
    corpus_class = parts[-2]
    # The partition is whatever follows the final '-' of the directory name.
    partition = parts[-3].split('-')[-1]
    annotations = {
        "name": corpus_name,
        "partition": partition,
        "class": corpus_class,
    }
    return [corpus_name, corpus_class, partition, corpus_id], annotations
def flatten(corpus_name, path):
    """Build the flat file name and JSON-ready record for one document.

    Returns a ``(file_name, record)`` pair. ``file_name`` joins every
    identifier except the corpus name with ``_`` and appends ``.json``;
    ``record`` holds the slash-joined ``_id``, the corpus annotations and the
    document text read from ``path``.
    """
    id_parts, corpus_meta = corpus_annotations(corpus_name, path)
    # The corpus name is dropped from the file name but kept in the _id.
    flat_name = '_'.join(id_parts[1:]) + '.json'
    record = {
        "_id": '/'.join(id_parts),
        "@annotations": {"corpus": corpus_meta},
        "text": contents_of(path),
    }
    return flat_name, record
def flatten_corpus(source='./corpuses/20news-bydate/*/*/*',
                   target='./corpuses/newsgroup-20-docs',
                   corpus_name='newsgroup-20'):
    """Write one flat JSON file into ``target`` per document matched by ``source``.

    Args:
        source: Glob pattern selecting the document files to convert.
        target: Directory that receives the JSON files; created if missing.
        corpus_name: Corpus name recorded in each document's ``_id`` and
            annotations.
    """
    makedirs_p(target)
    for path in glob.glob(source):
        # A glob can match directories as well as files; only files hold text.
        if os.path.isdir(path):
            continue
        flat_name, data = flatten(corpus_name, path)
        with open(os.path.join(target, flat_name), 'w', encoding='utf8') as f:
            # ensure_ascii=False writes non-ASCII characters as-is (UTF-8)
            # instead of \uXXXX escapes.
            json.dump(data, f, ensure_ascii=False)
if __name__ == "__main__":
    # Run the conversion with the default source pattern and target directory
    # when executed as a script.
    flatten_corpus()
@micimize
Copy link
Author

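Sample output for 20news-bydate/20news-bydate-train/rec.sport.baseball/104530, which is flattened to ./corpuses/newsgroup-20-docs/rec.sport.baseball_train_104530.json (pretty-printed here with sorted keys for readability; the script itself writes compact JSON with keys in insertion order):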

{
    "@annotations": {
        "corpus": {
            "class": "rec.sport.baseball",
            "name": "newsgroup-20",
            "partition": "train"
        }
    },
    "_id": "newsgroup-20/rec.sport.baseball/train/104530",
    "text": "From: pablo@jhunix.hcf.jhu.edu (Pablo A Iglesias)\nSubject: Re: Jewish Baseball Players?\nOrganization: Homewood Academic Computing, Johns Hopkins University, Baltimore, Md, USA\nLines: 24\nNNTP-Posting-Host: jhunix.hcf.jhu.edu\n\nIn article <15APR93.14691229.0062@lafibm.lafayette.edu> VB30@lafibm.lafayette.edu (VB30) writes:\n>Just wondering.A friend and I were talking the other day, and\n>we were (for some reason) trying to comeup with names of Jewish\n>baseball players, past and present.  We weren't able to come up\n>with much, except for Sandy Koufax, (somebody) Stankowitz, and\n>maybe John Lowenstein. Can anyone come up with any more.  I know\n>it sounds pretty lame to be racking our brains over this, but\n>humor us.  Thanks for your help.\n>\n>Thanks.\n>Bobby\n\n\n\nHank Greenberg would have to be the most famous, because his Jewish\nfaith actually affected his play. (missing late season or was it world\nseries games because of Yom Kippur)\n\n\n\n-- \nPablo Iglesias                        \npi@ruth.ece.jhu.edu\n\n"
}

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment