Skip to content

Instantly share code, notes, and snippets.

@ashokc
Last active January 16, 2019 01:05
Show Gist options
  • Save ashokc/357f003f0c2f9017f22e5329f93481b3 to your computer and use it in GitHub Desktop.
Save ashokc/357f003f0c2f9017f22e5329f93481b3 to your computer and use it in GitHub Desktop.
Construct a Text Corpus with Sequences
# Build a labelled corpus of fixed-length word sequences and split it
# into train/test index sets.
#
# Three kinds of sequences are generated from the sorted vocabulary:
#   ordered   - consecutive windows over the sorted words, wrapping around
#               at the end of the vocabulary
#   reversed  - every ordered window, reversed
#   unordered - words picked at independently drawn random indices
# The integer label stored for each sequence must agree with labelToName.
with open('words.txt', 'r') as f:
    # The file is treated as one comma-separated list; case is folded and
    # duplicates are dropped before sorting.
    words = sorted(set(f.read().lower().strip().split(',')))

X, labels = [], []
labelToName = {0: 'ordered', 1: 'unordered', 2: 'reversed'}
# Derived from labelToName so the two can never disagree.
namesInLabelOrder = [labelToName[label] for label in sorted(labelToName)]
nWords = len(words)
sequenceLength = 15

# Ordered sequences: one window per starting word. Modulo indexing wraps
# windows that run past the end of the vocabulary back to its beginning,
# so every window has exactly sequenceLength entries.
for start in range(nWords):
    X.append([words[(start + offset) % nWords]
              for offset in range(sequenceLength)])
    labels.append(0)  # 'ordered'

# Only the ordered windows exist in X at this point.
nSegments = len(X)

# Reversed sequences: label 2 per labelToName.
for i in range(nSegments):
    X.append(X[i][::-1])
    labels.append(2)  # 'reversed'

# Unordered sequences: label 1 per labelToName. As many are generated as
# there are ordered windows, keeping the three classes the same size.
for _ in range(nSegments):
    randIndices = np.random.randint(0, size=sequenceLength, high=nWords)
    X.append([words[j] for j in randIndices])
    labels.append(1)  # 'unordered'

# Single stratified 80/20 split; random_state is fixed for repeatability.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=1).split(X, labels)
train_indices, test_indices = next(sss)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment