Skip to content

Instantly share code, notes, and snippets.

Avatar

Mike Casale mikewcasale

View GitHub Profile
@mikewcasale
mikewcasale / narcissus_pr.py
Created Mar 26, 2020
narcissus print check
View narcissus_pr.py
# Sanity check: look up the registered "narcissus" task, build its "valid"
# split with input/target sequence lengths of 512, and print the first five
# examples.
nq_task = t5.data.TaskRegistry.get("narcissus")
ds = nq_task.get_dataset(
    split="valid",
    sequence_length=dict(inputs=512, targets=512),
)
for ex in tfds.as_numpy(ds.take(5)):
    print(ex)
View narcissus.py
t5.data.TaskRegistry.remove('narcissus')
t5.data.TaskRegistry.add(
"narcissus",
# Supply a function which returns a tf.data.Dataset.
dataset_fn=ds_func,
splits=["train", "valid"],
# Supply a function which preprocesses text from the tf.data.Dataset.
text_preprocessor=[,
lambda sample: t5.data.preprocessors.prefix_lm(sample, label='article: ')
],
View ds_func.py
def ds_func(split, shuffle_files=False):
    """Return a line-based text dataset for the requested split.

    Args:
        split: Name of the split to load. Lines are read from
            ``<DATA_DIR>/<split>.txt``.
            NOTE(review): assumes one ``<split>.txt`` file exists per
            registered split (e.g. ``train.txt`` and ``valid.txt``) --
            confirm against the code that writes these files.
        shuffle_files: Accepted for interface compatibility and ignored.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with a single
        ``"text"`` key holding one line of the file.
    """
    del shuffle_files  # Unused: only one file is read per split.
    # Pick the file that matches the split. Previously 'train.txt' was
    # hard-coded, so asking for "valid" silently returned training data.
    ds = tf.data.TextLineDataset(os.path.join(DATA_DIR, split + '.txt'))
    # The debugging `print(ex)` that used to be embedded in this lambda has
    # been removed; it had no effect on the produced elements.
    ds = ds.map(lambda ex: dict(text=ex))
    return ds
# Smoke test: print the first five examples produced for the "valid" split.
valid_preview = ds_func("valid").take(5)
for ex in tfds.as_numpy(valid_preview):
    print(ex)
@mikewcasale
mikewcasale / cleanup.py
Created Mar 26, 2020
cleanup NLP input
View cleanup.py
import re
from tqdm import tqdm_notebook as tqdm
for split in data:
with tf.io.gfile.GFile(os.path.join(DATA_DIR, split+'.txt'), 'w') as g:
for fn in tqdm(data[split]):
with open(fn, errors='ignore') as f:
text = f.read()
text = text.replace('\n', ' ').replace('\t', ' ')
ans = re.sub(' +', ' ', text)
View globit.py
import glob
import numpy as np
# Collect every entry exactly two levels below ./codedata. The pattern has no
# '**' component, so glob's `recursive` flag would have no effect and is
# omitted. The result is sorted because glob returns paths in arbitrary,
# filesystem-dependent order; without a stable starting order the seeded
# shuffle below would not be reproducible across machines.
files = sorted(glob.glob('./codedata/*/*'))
# Split files into test/train set
np.random.seed(1000)  # For reproducibility
np.random.shuffle(files)
# Index of the 80-20 training/validation boundary.
N = int(len(files) * 0.8)
data = dict(
View t5_colab_pt1.py
# TODO(adarob): Add support for 2.x.
# %tensorflow_version 1.x
import datetime
import functools
import json
import os
import pprint
import random
import string
@mikewcasale
mikewcasale / xls_to_csv_etractor.py
Last active Mar 26, 2020
extracts data from xls file and creates new csv
View xls_to_csv_etractor.py
import os
import pandas as pd
import xlrd
def createDirs():
# Directory
directories = ["./codedata","./codedata/StudyTitle","./codedata/PressReleaseTitle"]
for directory in directories:
try:
# Create the directory
You can’t perform that action at this time.