import json
import math
import os
import sys

from cluster_func import RC_PARAMS

# LOCAL_SETTINGS lives one directory up; put it on the import path.
sys.path.append('..')
from LOCAL_SETTINGS import DATA_DIR, SITES

import corenlpy
import t4k

# Float, so the bin-count division below is float division under
# Python 2 as well as Python 3.
BATCH_SIZE = 2000.0
INPUT_ARTICLES_DIRNAME = 'article-text-utf8'
cluf_options = {
    'pbs_options': {
        'ppn': 12,
        'walltime': '12:00:00',
        'pmem': '5799m'
    },
    'iterations': 1,
    'processes': 1,
    'prepend_statements': (
        ['module load Java/1.8.0_45'] + RC_PARAMS['prepend_statements']
    ),
    'jobs_dir': '../jobs'
}
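# Note (added for clarity; based on cluster_func's usual convention, not on
# anything stated in this gist): cluf discovers the module-level `args`
# iterable and `target` callable defined below, splits the tuples yielded
# by args() across subjobs, and invokes target(*args_tuple) for each one.
# The `cluf_options` dict above supplies the PBS resource requests used
# when those subjobs are submitted.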
def determine_number_of_bins():
    """Count, per site, how many batches of unprocessed articles remain,
    and record the counts in corenlp-bins.json."""
    num_bins = {}
    for site in SITES:
        site_dir = os.path.join(DATA_DIR, site)
        in_dir = os.path.join(site_dir, INPUT_ARTICLES_DIRNAME)
        all_files = t4k.ls(in_dir, dirs=False, basename=True)
        out_dir = os.path.join(site_dir, 'corenlp')

        # Get files that were already done.
        try:
            done_files = set(t4k.ls(out_dir, dirs=False, basename=True))
        except OSError:
            done_files = set()

        # Keep only files whose CoreNLP output doesn't exist yet.
        filtered_files = [
            os.path.join(in_dir, f) for f in all_files
            if (f + '.xml') not in done_files
        ]

        # Figure out how many bins are needed.
        site_num_bins = int(math.ceil(len(filtered_files) / BATCH_SIZE))
        num_bins[site] = site_num_bins

    with open('corenlp-bins.json', 'w') as bins_file:
        bins_file.write(json.dumps(num_bins, indent=2))
    print(json.dumps(num_bins, indent=2))
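# For illustration only (hypothetical site names): after running
# determine_number_of_bins(), corenlp-bins.json might contain
#
#   {
#     "site-a.com": 3,
#     "site-b.com": 0
#   }
#
# i.e. each site maps to the number of ~2000-file batches left to process.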
def args():
    """Yield (file_group, out_dir) tuples, one per batch of input files,
    based on the bin counts recorded by determine_number_of_bins()."""
    with open('corenlp-bins.json') as bins_file:
        num_bins = json.load(bins_file)
    for site in SITES:

        # Skip sites that have no bins.
        if num_bins[site] == 0:
            continue

        site_dir = os.path.join(DATA_DIR, site)
        in_dir = os.path.join(site_dir, INPUT_ARTICLES_DIRNAME)
        out_dir = os.path.join(site_dir, 'corenlp')
        all_files = t4k.ls(in_dir, dirs=False, basename=True)

        # Filter out files that have already been processed, based on the
        # fact that a corresponding output file already exists.
        try:
            done_files = set(t4k.ls(out_dir, dirs=False, basename=True))
        except OSError:
            done_files = set()
        filtered_files = [
            os.path.join(in_dir, f) for f in all_files
            if (f + '.xml') not in done_files
        ]

        # Put files into the prescribed number of bins; t4k.binify maps
        # each path to a bin index in [0, num_bins[site]).
        file_groups = [[] for _ in range(num_bins[site])]
        for path in filtered_files:
            idx = t4k.binify(path, num_bins[site])
            file_groups[idx].append(path)
        for file_group in file_groups:
            yield (file_group, out_dir)
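# For illustration only (hypothetical paths): with num_bins['site-a.com']
# equal to 2, args() would yield two tuples shaped like
#
#   (['/data/site-a.com/article-text-utf8/0001.txt', ...],
#    '/data/site-a.com/corenlp')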
def target(in_files, out_dir):
    """Run the CoreNLP annotation pipeline over one batch of input files."""
    annotators = [
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'dcoref']
    properties = {'ner.model': (
        'edu/stanford/nlp/models/ner/english.conll.'
        '4class.distsim.crf.ser.gz')}
    t4k.ensure_exists(out_dir)
    corenlpy.corenlp(
        in_files=in_files, out_dir=out_dir, threads=12,
        annotators=annotators, properties=properties
    )
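# A minimal local driver for debugging (an addition, not part of the
# original gist): it assumes cluster_func's target/args convention and runs
# every batch serially in this process instead of as PBS subjobs.
if __name__ == '__main__':
    determine_number_of_bins()
    for file_group, out_dir in args():
        target(file_group, out_dir)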