@enewe101
Created March 20, 2017 20:31
import json
import math
import os
import re
import sys

from cluster_func import RC_PARAMS

# LOCAL_SETTINGS lives one directory up from this script.
sys.path.append('..')
from LOCAL_SETTINGS import DATA_DIR, SITES

import corenlpy
import t4k

# Float so that the batch-count division below is true division under Python 2.
BATCH_SIZE = 2000.0
INPUT_ARTICLES_DIRNAME = 'article-text-utf8'
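# NOTE (assumption): LOCAL_SETTINGS is not part of this gist. A minimal sketch
# of what this script expects it to define, with hypothetical values:
#
#     DATA_DIR = '/path/to/data'    # root directory holding one folder per site
#     SITES = ['site-a', 'site-b']  # names of the site folders under DATA_DIR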
cluf_options = {
    'pbs_options': {
        'ppn': 12,            # processors per node; matches threads=12 in target()
        'walltime': '12:00:00',
        'pmem': '5799m'       # per-process memory
    },
    'iterations': 1,
    'processes': 1,
    'prepend_statements': (
        ['module load Java/1.8.0_45'] + RC_PARAMS['prepend_statements']
    ),
    'jobs_dir': '../jobs'
}
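# NOTE (assumption about cluster_func): cluf appears to read this module-level
# cluf_options dict to configure the PBS jobs it generates; 'prepend_statements'
# would be shell lines added to each job script before the Python invocation,
# here loading Java 1.8 for CoreNLP on top of whatever RC_PARAMS prescribes.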
def determine_number_of_bins():
    num_bins = {}
    for site in SITES:
        site_dir = os.path.join(DATA_DIR, site)
        in_dir = os.path.join(site_dir, INPUT_ARTICLES_DIRNAME)
        all_files = t4k.ls(in_dir, dirs=False, basename=True)
        out_dir = os.path.join(site_dir, 'corenlp')

        # Get files that were already done (the output dir may not exist yet)
        try:
            done_files = set(t4k.ls(out_dir, dirs=False, basename=True))
        except OSError:
            done_files = set()

        # Filter out files already done
        filtered_files = [
            os.path.join(in_dir, f) for f in all_files
            if (f + '.xml') not in done_files
        ]

        # Figure out how many bins are needed
        site_num_bins = int(math.ceil(len(filtered_files) / BATCH_SIZE))
        num_bins[site] = site_num_bins

    with open('corenlp-bins.json', 'w') as out_file:
        out_file.write(json.dumps(num_bins, indent=2))
    print(json.dumps(num_bins, indent=2))
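# The resulting corenlp-bins.json maps each site to its bin count, e.g.
# (hypothetical site names and counts):
#
#     {
#       "site-a": 3,
#       "site-b": 0
#     }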
def args():
    with open('corenlp-bins.json') as in_file:
        num_bins = json.loads(in_file.read())
    for site in SITES:
        # Skip sites that have no bins
        if num_bins[site] == 0:
            continue
        site_dir = os.path.join(DATA_DIR, site)
        in_dir = os.path.join(site_dir, INPUT_ARTICLES_DIRNAME)
        out_dir = os.path.join(site_dir, 'corenlp')
        all_files = t4k.ls(in_dir, dirs=False, basename=True)

        # Filter out files that have already been processed, based on the fact
        # that a corresponding output file already exists
        try:
            done_files = set(t4k.ls(out_dir, dirs=False, basename=True))
        except OSError:
            done_files = set()
        filtered_files = [
            os.path.join(in_dir, f) for f in all_files
            if (f + '.xml') not in done_files
        ]

        # Put files into the prescribed number of bins
        file_groups = [[] for _ in range(num_bins[site])]
        for file_path in filtered_files:
            idx = t4k.binify(file_path, num_bins[site])
            file_groups[idx].append(file_path)
        for file_group in file_groups:
            yield (file_group, out_dir)
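# NOTE (assumption): t4k.binify is taken to map a string deterministically to
# a bin index in range(num_bins), so that re-running args() assigns any
# still-unprocessed file to the same bin as before. A minimal equivalent
# sketch (illustrative only; the real implementation is t4k's):
import hashlib

def _binify_sketch(string, num_bins):
    # Hash the string and reduce the digest modulo the number of bins.
    digest = hashlib.md5(string.encode('utf8')).hexdigest()
    return int(digest, 16) % num_bins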
def target(in_files, out_dir):
    # Annotate one batch of input files with CoreNLP, writing XML to out_dir.
    annotators = [
        'tokenize', 'ssplit', 'pos', 'lemma', 'ner', 'parse', 'dcoref']
    properties = {
        'ner.model': (
            'edu/stanford/nlp/models/ner/english.conll.'
            '4class.distsim.crf.ser.gz'
        )
    }
    t4k.ensure_exists(out_dir)
    corenlpy.corenlp(
        in_files=in_files, out_dir=out_dir, threads=12,
        annotators=annotators, properties=properties
    )
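# NOTE (assumption about cluster_func's conventions): cluf discovers the
# module-level args() generator and target() function defined above, pairing
# each yielded (file_group, out_dir) tuple with one invocation of target().
# A hypothetical run, after first generating corenlp-bins.json:
#
#     python -c "import THIS_MODULE; THIS_MODULE.determine_number_of_bins()"
#     cluf THIS_MODULE.py
#
# THIS_MODULE stands in for this file's module name, which the gist does not
# record.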