@surmenok
Created November 6, 2016 21:08
# Copyright 2014 Google Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Script for downloading and generating question/answer pairs.
"""
import argparse
from collections import namedtuple
import hashlib
from itertools import chain
from itertools import izip
from itertools import repeat
import math
from multiprocessing.pool import Pool
from multiprocessing.pool import ThreadPool
import os
import re
import sys
import time
import cchardet as chardet
from lxml import html
import requests
import socket


class Story(namedtuple('StoryBase', 'url content highlights title')):

  def ToString(self):
    return self.title + '\n' + self.content + ''.join([
        '\n\n@highlight\n\n' + highlight
        for highlight in self.highlights])


AnonymizedStory = namedtuple(
    'AnonymizedStory', 'url content highlights anonymization_info')
RawStory = namedtuple('RawStory', 'url html')
TokenizedStory = namedtuple('TokenizedStory', 'url tokens')


class QuestionContext(
    namedtuple(
        'QuestionContextBase',
        'url context question answer anonymization_info')):

  def ToString(self):
    return '%s\n\n%s\n\n%s\n\n%s\n\n%s' % (
        self.url, self.context, self.question, self.answer,
        '\n'.join([
            key + ':' + value
            for key, value in self.anonymization_info.iteritems()]))


def ReadUrls(filename):
  """Reads a list of URLs.

  Args:
    filename: The filename containing the URLs.

  Returns:
    A list of URLs.
  """

  with open(filename) as f:
    return [line.strip('\n') for line in f]


def ReadMultipleUrls(filename):
  """Reads a list of URL lists.

  Each line in the file should contain a list of URLs separated by commas.

  Args:
    filename: The filename containing the URLs.

  Returns:
    A list of lists of URLs.
  """

  with open(filename) as f:
    return [line.strip('\n').split(',') for line in f]


def WriteUrls(filename, urls):
  """Writes a list of URLs to a file.

  Args:
    filename: The filename of the file where the URLs should be written.
    urls: The list of URLs to write.
  """

  with open(filename, 'w') as f:
    f.writelines(url + '\n' for url in urls)


def Hashhex(s):
  """Returns a hexadecimal-formatted SHA1 hash of the input string.

  Args:
    s: The string to hash.

  Returns:
    A hexadecimal-formatted hash of the input string.
  """

  h = hashlib.sha1()
  h.update(s)
  return h.hexdigest()


def ReadDownloadedUrl(url, corpus):
  """Reads a downloaded URL from disk.

  Args:
    url: The URL to read.
    corpus: The corpus the URL belongs to.

  Returns:
    The content of the URL.
  """

  try:
    with open('%s/downloads/%s.html' % (corpus, Hashhex(url))) as f:
      return f.read()
  except IOError:
    return None


wayback_pattern = re.compile(r'web/([^/]*)/')


def WaybackUrl(urls, max_attempts=6):
  """Retrieves the URL for the latest historic copy using the Wayback Machine.

  Args:
    urls: The URLs for a specific page (canonical URL + forwarding URLs).
    max_attempts: The maximum attempts at requesting the URL.

  Returns:
    The URL or None if no copy is stored for the URL.

  Raises:
    RuntimeError: Failed to retrieve the URL.
  """

  if not urls:
    return None

  url = urls[0]
  index_collection_url = 'http://archive.org/wayback/available'
  payload = {'url': url}

  attempts = 0
  while attempts < max_attempts:
    try:
      entry_req = requests.get(index_collection_url, params=payload,
                               allow_redirects=False)

      if entry_req.status_code != requests.codes.ok:
        return WaybackUrl(urls[1:], max_attempts)

      entry = entry_req.json()

      if 'closest' not in entry['archived_snapshots']:
        return WaybackUrl(urls[1:], max_attempts)

      wayback_url = entry['archived_snapshots']['closest']['url']
      wayback_url = wayback_pattern.sub(r'web/\g<1>id_/', wayback_url, 1)
      return wayback_url
    except requests.exceptions.ConnectionError:
      pass

    # Exponential back-off.
    time.sleep(math.pow(2, attempts))
    attempts += 1

  raise RuntimeError(
      'Failed to download URL for %s after %d attempts. Please run the script '
      'again.' %
      (url, max_attempts))
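
# Note on the wayback_pattern substitution above: it rewrites a snapshot URL
# such as http://web.archive.org/web/20080524054053/http://www.cnn.com/...
# into http://web.archive.org/web/20080524054053id_/http://www.cnn.com/...
# The 'id_' flag asks the Wayback Machine for the page as originally captured,
# without the archive's navigation banner injected into the HTML.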


def DownloadUrl(url, corpus, max_attempts=5, timeout=5):
  """Downloads a URL.

  Args:
    url: The URL.
    corpus: The corpus of the URL.
    max_attempts: Max attempts for downloading the URL.
    timeout: Connection timeout in seconds for each attempt.

  Returns:
    The HTML at the URL or None if the request failed.
  """

  try:
    with open('%s/downloads/%s.html' % (corpus, Hashhex(url))) as f:
      return f.read()
  except IOError:
    pass

  attempts = 0
  while attempts < max_attempts:
    try:
      req = requests.get(url, allow_redirects=False, timeout=timeout)

      if req.status_code == requests.codes.ok:
        content = req.text.encode(req.encoding)

        with open('%s/downloads/%s.html' % (corpus, Hashhex(url)), 'w') as f:
          f.write(content)

        return content
      elif (req.status_code in [301, 302, 404, 503]
            and attempts == max_attempts - 1):
        return None
    except requests.exceptions.ConnectionError:
      pass
    except requests.exceptions.ContentDecodingError:
      return None
    except requests.exceptions.ChunkedEncodingError:
      return None
    except requests.exceptions.Timeout:
      pass
    except socket.timeout:
      pass

    # Exponential back-off.
    time.sleep(math.pow(2, attempts))
    attempts += 1

  return None


def ParseHtml(story, corpus):
  """Parses the HTML of a news story.

  Args:
    story: The raw Story to be parsed.
    corpus: Either 'cnn' or 'dailymail'.

  Returns:
    A Story containing URL, paragraphs and highlights.
  """

  parser = html.HTMLParser(encoding=chardet.detect(story.html)['encoding'])
  tree = html.document_fromstring(story.html, parser=parser)

  # Elements to delete.
  delete_selectors = {
      'cnn': [
          '//blockquote[contains(@class, "twitter-tweet")]',
          '//blockquote[contains(@class, "instagram-media")]'
      ],
      'dailymail': [
          '//blockquote[contains(@class, "twitter-tweet")]',
          '//blockquote[contains(@class, "instagram-media")]'
      ]
  }

  # Paragraph exclusions: ads, links, bylines, comments.
  cnn_exclude = (
      'not(ancestor::*[contains(@class, "metadata")])'
      ' and not(ancestor::*[contains(@class, "pullquote")])'
      ' and not(ancestor::*[contains(@class, "SandboxRoot")])'
      ' and not(ancestor::*[contains(@class, "twitter-tweet")])'
      ' and not(ancestor::div[contains(@class, "cnnStoryElementBox")])'
      ' and not(contains(@class, "cnnTopics"))'
      ' and not(descendant::*[starts-with(text(), "Read:")])'
      ' and not(descendant::*[starts-with(text(), "READ:")])'
      ' and not(descendant::*[starts-with(text(), "Join us at")])'
      ' and not(descendant::*[starts-with(text(), "Join us on")])'
      ' and not(descendant::*[starts-with(text(), "Read CNNOpinion")])'
      ' and not(descendant::*[contains(text(), "@CNNOpinion")])'
      ' and not(descendant-or-self::*[starts-with(text(), "Follow us")])'
      ' and not(descendant::*[starts-with(text(), "MORE:")])'
      ' and not(descendant::*[starts-with(text(), "SPOILER ALERT:")])')

  dm_exclude = (
      'not(ancestor::*[contains(@id,"reader-comments")])'
      ' and not(contains(@class, "byline-plain"))'
      ' and not(contains(@class, "byline-section"))'
      ' and not(contains(@class, "count-number"))'
      ' and not(contains(@class, "count-text"))'
      ' and not(contains(@class, "video-item-title"))'
      ' and not(ancestor::*[contains(@class, "column-content")])'
      ' and not(ancestor::iframe)')

  paragraph_selectors = {
      'cnn': [
          '//div[contains(@class, "cnnContentContainer")]//p[%s]' % cnn_exclude,
          '//div[contains(@class, "l-container")]//p[%s]' % cnn_exclude,
          '//div[contains(@class, "cnn_strycntntlft")]//p[%s]' % cnn_exclude
      ],
      'dailymail': [
          '//div[contains(@class, "article-text")]//p[%s]' % dm_exclude
      ]
  }

  title_selectors = [
      '//title'
  ]

  # Highlight exclusions.
  he = (
      'not(contains(@class, "cnnHiliteHeader"))'
      ' and not(descendant::*[starts-with(text(), "Next Article in")])')
  highlight_selectors = {
      'cnn': [
          '//*[contains(@class, "el__storyhighlights__list")]//li[%s]' % he,
          '//*[contains(@class, "cnnStryHghLght")]//li[%s]' % he,
          '//*[@id="cnnHeaderRightCol"]//li[%s]' % he
      ],
      'dailymail': [
          '//h1/following-sibling::ul//li'
      ]
  }

  title_exclusions = ['- CNN.com', '| Mail Online', '| Daily Mail Online']

  def ExtractText(selector):
    """Extracts a list of paragraphs given an XPath selector.

    Args:
      selector: An XPath selector to find the paragraphs.

    Returns:
      A list of raw text paragraphs with leading and trailing whitespace.
    """

    xpaths = map(tree.xpath, selector)
    elements = list(chain.from_iterable(xpaths))
    paragraphs = [e.text_content().encode('utf-8') for e in elements]

    # Remove editorial notes, etc.
    if corpus == 'cnn' and len(paragraphs) >= 2 and '(CNN)' in paragraphs[1]:
      paragraphs.pop(0)

    paragraphs = map(str.strip, paragraphs)
    paragraphs = [s for s in paragraphs if s and not str.isspace(s)]

    return paragraphs

  for selector in delete_selectors[corpus]:
    for bad in tree.xpath(selector):
      bad.getparent().remove(bad)

  paragraphs = ExtractText(paragraph_selectors[corpus])
  highlights = ExtractText(highlight_selectors[corpus])
  titles = ExtractText(title_selectors)

  title = titles[0] if len(titles) > 0 else ''
  for title_exclusion in title_exclusions:
    title = title.replace(title_exclusion, '')
  title = title.strip()

  content = '\n\n'.join(paragraphs)

  return Story(story.url, content, highlights, title)


def WriteStory(story, corpus):
  """Writes a news story to disk.

  Args:
    story: The news story to write.
    corpus: The corpus the news story belongs to.
  """

  story_string = story.ToString()
  url_hash = Hashhex(story.url)

  with open('%s/stories/%s.story' % (corpus, url_hash), 'w') as f:
    f.write(story_string)
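
# With the modified ToString() above, each stories/<sha1-of-url>.story file
# starts with the extracted title on its own line, followed by the article
# paragraphs separated by blank lines, with an "@highlight" marker (on its own
# line, surrounded by blank lines) before each highlight sentence.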


def LoadTokenMapping(filename):
  """Loads a token mapping from the given filename.

  Args:
    filename: The filename containing the token mapping.

  Returns:
    A list of (start, end) pairs, where start and end (inclusive) are character
    offsets into the content for a token. The list is sorted.
  """

  mapping = []

  with open(filename) as f:
    line = f.readline().strip()

    for token_mapping in line.split(';'):
      if not token_mapping:
        continue

      start, length = token_mapping.split(',')
      mapping.append((int(start), int(start) + int(length)))

    mapping.sort(key=lambda x: x[1])  # Sort by end offset.

  return mapping
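
# File format, as inferred from the parser above: each tokens/<hash>.txt file
# holds one ';'-separated line of "start,length" entries, e.g. "0,5;7,3;11,8;".
# An entry "7,3" becomes the pair (7, 10), and Tokenize() below slices the
# story string as s[start:end + 1] for each pair.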


def Tokenize(story, corpus):
  """Tokenizes a news story.

  Args:
    story: The Story.
    corpus: The corpus of the news story.

  Returns:
    A TokenizedStory containing the URL and the tokens or None if no token
    mapping was found for the URL.
  """

  s = story.ToString()
  url_hash = Hashhex(story.url)

  mapping_filename = '%s/tokens/%s.txt' % (corpus, url_hash)
  if not os.path.exists(mapping_filename):
    return None

  mapping = LoadTokenMapping(mapping_filename)

  tokens = []
  for (start, end) in mapping:
    tokens.append(s[start:end + 1])

  return TokenizedStory(story.url, tokens)


def LoadEntityMapping(filename):
  """Loads an entity mapping from the given filename.

  Args:
    filename: The filename containing the entity mapping.

  Returns:
    A list of (entity_index, start, end) tuples, where start and end
    (inclusive) are token offsets for an entity. The list is sorted.
  """

  mapping = []

  with open(filename) as f:
    line = f.readline().strip()

    for entity_mapping in line.split(';'):
      if not entity_mapping:
        continue

      entity_index, start, end = entity_mapping.split(',')
      mapping.append((int(entity_index), int(start), int(end)))

    mapping.sort(key=lambda x: x[2])  # Sort by end offset.

  return mapping
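
# File format, as inferred from the parser above: each entities/<hash>.txt file
# holds one ';'-separated line of "entity_index,start,end" entries, e.g.
# "3,0,1;12,7,7;", where start and end are inclusive token positions.
# Anonymize() below replaces tokens[start:end + 1] with '@entity<entity_index>'
# and records the original token span in anonymization_info.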


def Anonymize(tokenized_story, corpus):
  """Anonymizes a tokenized news story.

  Args:
    tokenized_story: A TokenizedStory.
    corpus: The corpus of the tokenized news story.

  Returns:
    An AnonymizedStory containing the URL, anonymized content and anonymized
    highlights, or None if no entity mapping exists for the news story.
  """

  url_hash = Hashhex(tokenized_story.url)

  mapping_filename = '%s/entities/%s.txt' % (corpus, url_hash)
  if not os.path.exists(mapping_filename):
    return None

  mapping = LoadEntityMapping(mapping_filename)
  mapping_index = 0
  mapping_len = len(mapping)

  new_tokens = []
  anonymization_info = {}

  i = 0
  while i < len(tokenized_story.tokens):
    if mapping_index < mapping_len and mapping[mapping_index][1] == i:
      entity_index, start, end = mapping[mapping_index]

      anonymized_entity_name = '@entity%d' % entity_index
      new_tokens.append(anonymized_entity_name)
      anonymization_info[anonymized_entity_name] = ' '.join(
          tokenized_story.tokens[start: end + 1]).replace(' - ', '-')

      mapping_index += 1
      i = end + 1
    else:
      new_tokens.append(tokenized_story.tokens[i])
      i += 1

  parts = ' '.join(new_tokens).split(' @ highlight ')
  content = parts[0]
  highlights = parts[1:]

  return AnonymizedStory(
      tokenized_story.url, content, highlights, anonymization_info)


entity_pattern = re.compile(r'@entity\d+')


def GenerateQuestionContexts(anonymized_story, context_token_limit):
  """Generates a list of question/answer pairs given an anonymized news story.

  One question/answer pair is generated for each anonymized entity that appears
  both in a highlight and in the story content.

  Args:
    anonymized_story: The anonymized news story.
    context_token_limit: If the context of a news story is above this limit,
      the empty list will be returned.

  Returns:
    A list of QuestionContext containing questions and answers.
  """

  result = []

  if anonymized_story.content.count(' ') + 1 > context_token_limit:
    return result

  entities_in_context = set(entity_pattern.findall(anonymized_story.content))

  for highlight in anonymized_story.highlights:
    for match in entity_pattern.finditer(highlight):
      start, end = match.span()

      answer = highlight[start:end]

      if answer not in entities_in_context:
        # Ignore entities that don't appear in the content, as these will be
        # impossible (or very hard) to answer.
        continue

      question = ('%s@placeholder%s' %
                  (highlight[0:start], highlight[end:])).lower()
      context = anonymized_story.content.lower()
      url = anonymized_story.url
      anonymization_info = anonymized_story.anonymization_info

      result.append(
          QuestionContext(url, context, question, answer, anonymization_info))

  return result
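
# For example (taken from the sample output in the comments below), the
# anonymized highlight "@entity2 projects @entity3 wins @entity52 , @entity1 ,
# @entity53 and @entity54" yields, for @entity52, the question
# "@entity2 projects @entity3 wins @placeholder , @entity1 , @entity53 and
# @entity54" with the answer "@entity52".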


def WriteQuestionContext(question_context, corpus, dataset):
  """Writes a question/answer pair to disk.

  Args:
    question_context: The QuestionContext to write, containing the question and
      answer.
    corpus: The corpus the question/answer belongs to.
    dataset: One of 'training', 'validation' and 'test'.
  """

  s = question_context.ToString()
  h = Hashhex(s)

  with open('%s/questions/%s/%s.question' % (corpus, dataset, h), 'w') as f:
    f.write(s)


class ProgressBar(object):
  """Simple progress bar.

  Output example:
    100.00% [2152/2152]
  """

  def __init__(self, total=100, stream=sys.stderr):
    self.total = total
    self.stream = stream
    self.last_len = 0
    self.curr = 0

  def Increment(self):
    self.curr += 1
    self.PrintProgress(self.curr)

    if self.curr == self.total:
      print ''

  def PrintProgress(self, value):
    self.stream.write('\b' * self.last_len)
    pct = 100 * self.curr / float(self.total)
    out = '{:.2f}% [{}/{}]'.format(pct, value, self.total)
    self.last_len = len(out)
    self.stream.write(out)
    self.stream.flush()


datasets = ['training', 'validation', 'test']


def UrlMode(corpus, request_parallelism):
  """Finds Wayback Machine URLs and writes them to disk.

  Args:
    corpus: A corpus.
    request_parallelism: The number of concurrent requests.
  """

  for dataset in datasets:
    print 'Finding Wayback Machine URLs for the %s set:' % dataset

    old_urls_filename = '%s/%s_urls.txt' % (corpus, dataset)
    new_urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)

    urls = ReadMultipleUrls(old_urls_filename)

    p = ThreadPool(request_parallelism)
    results = p.imap_unordered(WaybackUrl, urls)

    progress_bar = ProgressBar(len(urls))
    new_urls = []
    for result in results:
      if result:
        new_urls.append(result)

      progress_bar.Increment()

    WriteUrls(new_urls_filename, new_urls)


def DownloadMapper(t):
  """Downloads a URL and checks that metadata is available for the URL.

  Args:
    t: A tuple (url, corpus).

  Returns:
    A pair of URL and content.

  Raises:
    RuntimeError: No metadata available.
  """

  url, corpus = t

  url_hash = Hashhex(url)
  mapping_filename = '%s/entities/%s.txt' % (corpus, url_hash)
  if not os.path.exists(mapping_filename):
    raise RuntimeError('No metadata available for %s.' % url)

  return url, DownloadUrl(url, corpus)


def DownloadMode(corpus, request_parallelism):
  """Downloads the URLs for the specified corpus.

  Args:
    corpus: A corpus.
    request_parallelism: The number of concurrent download requests.
  """

  missing_urls = []
  for dataset in datasets:
    print 'Downloading URLs for the %s set:' % dataset

    urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
    urls = ReadUrls(urls_filename)

    missing_urls_filename = '%s/missing_urls.txt' % corpus
    if os.path.exists(missing_urls_filename):
      print 'Only downloading missing URLs'
      urls = list(set(urls).intersection(ReadUrls(missing_urls_filename)))

    p = ThreadPool(request_parallelism)
    results = p.imap_unordered(DownloadMapper, izip(urls, repeat(corpus)))

    progress_bar = ProgressBar(len(urls))

    collected_urls = []
    try:
      for url, story_html in results:
        if story_html:
          collected_urls.append(url)

        progress_bar.Increment()
    except KeyboardInterrupt:
      print 'Interrupted by user'

    missing_urls.extend(set(urls) - set(collected_urls))

  WriteUrls('%s/missing_urls.txt' % corpus, missing_urls)

  if missing_urls:
    print ('%d URLs couldn\'t be downloaded, see %s/missing_urls.txt.'
           % (len(missing_urls), corpus))
    print 'Try and run the command again to download the missing URLs.'


def StoreMapper(t):
  """Reads a URL from disk and returns the parsed news story.

  Args:
    t: A tuple (url, corpus).

  Returns:
    A Story containing the parsed news story.
  """

  url, corpus = t

  story_html = ReadDownloadedUrl(url, corpus)
  if not story_html:
    return None

  raw_story = RawStory(url, story_html)
  return ParseHtml(raw_story, corpus)


def StoreMode(corpus):
  for dataset in datasets:
    print 'Storing news stories for the %s set:' % dataset

    urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
    urls = ReadUrls(urls_filename)

    p = Pool()
    stories = p.imap_unordered(StoreMapper, izip(urls, repeat(corpus)))

    progress_bar = ProgressBar(len(urls))
    for story in stories:
      if story:
        WriteStory(story, corpus)

      progress_bar.Increment()


def GenerateMapper(t):
  """Reads a URL from disk and returns a list of question/answer pairs.

  Args:
    t: A tuple (url, corpus, context_token_limit).

  Returns:
    A list of QuestionContext containing a question and an answer.
  """

  url, corpus, context_token_limit = t

  story_html = ReadDownloadedUrl(url, corpus)
  if not story_html:
    return None

  raw_story = RawStory(url, story_html)

  story = ParseHtml(raw_story, corpus)
  tokenized = Tokenize(story, corpus)

  if not tokenized:
    return None

  anonymized = Anonymize(tokenized, corpus)

  if not anonymized:
    return None

  return GenerateQuestionContexts(anonymized, context_token_limit)


def GenerateMode(corpus, context_token_limit):
  for dataset in datasets:
    print 'Generating questions for the %s set:' % dataset

    urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
    urls = ReadUrls(urls_filename)

    p = Pool()
    question_context_lists = p.imap_unordered(
        GenerateMapper,
        izip(urls, repeat(corpus), repeat(context_token_limit)))

    progress_bar = ProgressBar(len(urls))
    for question_context_list in question_context_lists:
      if question_context_list:
        for question_context in question_context_list:
          WriteQuestionContext(question_context, corpus, dataset)

      progress_bar.Increment()


def RemoveMode(corpus):
  missing_urls = set(ReadUrls('%s/missing_urls.txt' % corpus))

  for dataset in datasets:
    urls_filename = '%s/wayback_%s_urls.txt' % (corpus, dataset)
    urls = ReadUrls(urls_filename)

    new_urls = []
    for url in urls:
      if url not in missing_urls:
        new_urls.append(url)

    WriteUrls(urls_filename, new_urls)


def main():
  parser = argparse.ArgumentParser(
      description='Generates question/answer pairs')
  parser.add_argument('--corpus', choices=['cnn', 'dailymail'], default='cnn')
  parser.add_argument(
      '--mode', choices=['store', 'generate', 'download', 'urls', 'remove'],
      default='generate')
  parser.add_argument('--request_parallelism', type=int, default=200)
  parser.add_argument('--context_token_limit', type=int, default=2000)
  args = parser.parse_args()

  stories_dir = '%s/stories' % args.corpus
  if not os.path.exists(stories_dir):
    os.mkdir(stories_dir)

  downloads_dir = '%s/downloads' % args.corpus
  if not os.path.exists(downloads_dir):
    os.mkdir(downloads_dir)

  questions_dir = '%s/questions' % args.corpus
  if not os.path.exists(questions_dir):
    os.mkdir(questions_dir)

  for dataset in datasets:
    dataset_dir = '%s/questions/%s' % (args.corpus, dataset)
    if not os.path.exists(dataset_dir):
      os.mkdir(dataset_dir)

  if args.mode == 'store':
    StoreMode(args.corpus)
  elif args.mode == 'generate':
    GenerateMode(args.corpus, args.context_token_limit)
  elif args.mode == 'download':
    DownloadMode(args.corpus, args.request_parallelism)
  elif args.mode == 'urls':
    UrlMode(args.corpus, args.request_parallelism)
  elif args.mode == 'remove':
    RemoveMode(args.corpus)


if __name__ == '__main__':
  main()
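
# One possible invocation order (the filename is assumed; the modes and flags
# are the ones defined in main() above). The <corpus>/ directory (cnn/ or
# dailymail/) is expected to already contain the *_urls.txt lists plus the
# tokens/ and entities/ metadata from the original data release:
#
#   python generate_questions.py --corpus=cnn --mode=urls
#   python generate_questions.py --corpus=cnn --mode=download
#   python generate_questions.py --corpus=cnn --mode=store
#   python generate_questions.py --corpus=cnn --mode=generate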

arubis commented Jan 27, 2017

I've found that your modifications to the original script actually generate garbage instead of usable English. Example from the original script:

http://web.archive.org/web/20080524054053id_/http://www.cnn.com:80/2008/POLITICS/03/04/march.4.gop/index.html?eref=rss_latest

@entity0 , @entity1 ( @entity2 ) -- @entity4 sen. @entity3 , whose @entity6 aspirations went into a nose dive last summer , clinched the @entity9 's presidential nomination tuesday night with a sweep of @entity9 contests in four states . " i am very , very grateful and pleased to note that tonight , my friends , we have won enough delegates to claim with confidence , humility and a great sense of responsibility , that i will be the @entity9 nominee for president of the @entity22 , " @entity3 told supporters in @entity1 . @entity2 estimates that @entity3 has amassed 1,195 delegates to the @entity9 's september convention in @entity25 , @entity26 , four more than the 1,191 needed to claim the party 's nomination . " now , we begin the most important part of our campaign : to make a respectful , determined and convincing case to the @entity22 people that our campaign and my election as president , given the alternative presented by our friends in the other party , is in the best interest in the country that we love , " @entity3 said . " the big battle 's to come , " he said . " i do not underestimate the significance nor the size of the challenge . " watch @entity3 address supporters after sweeping tuesday 's contests » @entity3 's last leading rival , former @entity49 gov. @entity48 , bowed out of the race after his projected losses in @entity1 , @entity52 , @entity53 and @entity54 and urged his supporters to back the @entity4 senator in november . " it 's now important that we turn our attention not to what could have been or what we wanted to have been but now what must be , and that is a united party , " @entity48 said . watch as @entity48 ends his presidential bid » claiming the title of presumptive nominee will give @entity3 a head start on the general election campaign while @entity71 contenders @entity69 and @entity72 are still locked in a battle for their party 's title , said @entity77 , a @entity9 strategist and @entity2 contributor . allocate delegates yourself and see how the numbers add up » " tomorrow , he can get started , " @entity77 said . " he 'll have the [ @entity83 ] behind him . he 'll have a broad base of financial support . it 's a big step . meanwhile , it looks like the @entity71 are engaged in the land war across @entity89 , so he 's got a big advantage now . " both @entity93 , the @entity95 senator and former first lady , and @entity72 , the first - term senator from @entity97 , called @entity3 on tuesday night , campaign officials said . @entity72 told @entity3 he looks forward to running against him in the fall , campaign spokeswoman @entity98 said . @entity3 is slated to go to the @entity6 on wednesday to receive the endorsement of president @entity101 , according to two @entity9 sources . the @entity4 senator 's campaign -- his second run for the @entity6 -- was largely written off last summer amid outspoken opposition from the party 's conservative base , a major staff shakeup and disappointing fundraising . but the former @entity112 pilot and @entity114 prisoner of war rebounded with wins in january 's primaries in @entity117 and @entity118 , the state where his first presidential bid foundered . " there were times , obviously , when my political campaign was not viewed as the most viable in @entity22 , as you probably know , " he told reporters in @entity125 earlier tuesday . " in fact , i was reminded of the words of chairman @entity129 , who said it 's always darkest before it 's totally black . 
" @entity3 's fortunes also rebounded as @entity22 commanders in @entity133 credited the 2007 launch of a campaign to pacify @entity136 and its surrounding provinces with a sharp decline in @entity22 and @entity133 casualties . the senator had been one of the most outspoken advocates of the shift and has blasted his potential @entity71 rivals for calling for the withdrawal of @entity22 combat troops from the widely unpopular war . " this is a man with a lot of trials in his life , " said former education secretary @entity147 , a @entity2 contributor . " he 's had a lot of downs ; he 's been up , and this is a big up . " @entity3 has been turning his fire on the @entity71 , for whom tuesday 's races in @entity52 and @entity1 are seen as pivotal . see scenes from tuesday 's voting » but @entity71 have been pounding @entity3 over his january comment that he would be satisfied if @entity22 troops remained in @entity133 for 100 years , as long as the insurgency there died down . and @entity165 chairman @entity164 has attacked his reputation as a reformer over the past week , accusing @entity3 of trying to evade federal spending limits by opting out of public financing after using the promise of federal funds to obtain a bank loan and automatic ballot access for his primary campaign . @entity164 told @entity2 on tuesday that @entity3 " really is the focus of what we 're doing now , in terms of his ethics problems and his problems with the war and his problems with the huge deficits that they 've run up on the @entity9 side . " in 2000 , @entity3 upset then - @entity1 gov. @entity101 in the @entity117 primary by touting " straight talk " and his record as a @entity9 maverick . @entity101 came back in @entity118 amid a divisive and bitter campaign that left @entity3 denouncing leaders of the party 's religious conservative wing as " agents of intolerance , " and @entity101 went on to win the presidency . since then , @entity3 has enraged conservative leaders by opposing @entity101 's signature tax cuts , co-sponsoring the campaign finance reform law that now bears his name and supporting a controversial @entity6 - backed plan to offer a path to citizenship for illegal immigrants . but their support was spread among a fractured @entity9 field , and their main standard - bearer , former @entity208 gov. @entity207 , quit the race after a disappointing showing in february 's @entity212 primaries . exit polls in @entity1 and @entity52 found that about three - quarters of @entity9 would be satisfied with @entity3 as their nominee , however . those surveys found that the economy was the top issue for @entity9 voters in both states -- and by a wide margin in @entity52 , which has seen a sharp decline in manufacturing jobs in the past decade . although national security issues are a strong suit for @entity3 , @entity77 said he might need some help if a weakening economy is the central issue in november . " it 's never been sen. @entity3 's strength , " @entity77 said . he said @entity3 would need to make the case that " i 'm going to grow this economy ; @entity72 or @entity69 , they 're going to grow government . " @entity3 had amassed 1,047 delegates before tuesday , according to @entity2 estimates . at stake in tuesday 's contests were 256 delegates , allocated on a winner - take - all basis by statewide or congressional district results . e-mail to a friend @entity2 correspondent @entity244 and political editor @entity245 contributed to this report .

@entity2 projects @entity3 wins @placeholder , @entity1 , @entity53 and @entity54

@entity52

@entity22:U.S.
@entity26:Minnesota
@entity25:Minneapolis-St. Paul
@entity98:Jennifer Psaki
@entity117:New Hampshire
@entity83:Republican National Committee
@entity112:Navy
@entity89:Russia
@entity136:Baghdad
@entity133:Iraq
@entity118:South Carolina
@entity3:McCain
@entity2:CNN
@entity1:Texas
@entity0:DALLAS
@entity6:White House
@entity212:Super Tuesday
@entity4:Arizona
@entity93:Clinton
@entity9:GOP
@entity53:Rhode Island
@entity52:Ohio
@entity54:Vermont
@entity77:Castellanos
@entity71:Democrats
@entity72:Barack
@entity101:Bush
@entity245:Mark Preston
@entity244:Dana Bash
@entity129:Mao
@entity97:Illinois
@entity95:New York
@entity125:San Antonio
@entity207:Mitt Romney
@entity147:William Bennett
@entity164:Dean
@entity165:Democratic National Committee
@entity208:Massachusetts
@entity69:Hillary
@entity114:Vietnam
@entity48:Huckabee
@entity49:Arkansas%

Example output from your modified script:

http://web.archive.org/web/20150202183733id_/http://www.cnn.com/2014/02/02/us/super-bowl/

@entity0 le seah wks fla ten den @entity1 8 in  @entity2 owl  lvii  s art d out   @entity3 e denv r  b r nc s. the  e ttle seahaw s made  ure it got worse.

s at le r @entity1 -8 by pla y ng a suff cat ng defense an  takin  a vantage of  @entity8 n ver @entity9 nc udin  two int rcepti ns  hro n by q u ter ack  eyton  ann ng, the  nf 's m st valu a @entity1 y r for t e s ason

the g a @entity3 d pro foo ball'  best offen e, de ve @entity16 st @entity17 e @entity18 r a  isa peare  ear y at metlife s t d um  n ea t ru herford  new @entity8
on the fi st pla  fr m sc imm g , a ba d snap w @entity3 n   p a t ma ning an  landed i  the   @entity1  fo r a safet , giv ng  eattl  a 2-0  l ad w tho t ru nin  a  la . den v @entity3  s ore unt l t e  hird q arte @entity1 the gam  was ou  of r ach.

s e ttle's d fens  wa  so @entity3 t @entity8 r ma ag d o ly 27 yard  ru hing, co par d w i h 1 5 fo  seatt e. m nni g wen  34 for 49 t  ga n 279 yar s in th  air, but  m st  asses were  f @entity34  w @entity3 r ceiv rs qui kl  t ken  own  his  oun erpar , r ssell wils o of seattl e   @entity3 or 26 to  ain 20 6 yards and  @entity36 h own .

fi tin ly,  a  eatt e defens ve pla y @entity3 cke  ma colm  @entity39 am d the gam 's most   @entity8 pla er.

sea tl  wo  its  irs   uper b @entity39 hise   story  ma ning w a @entity3 d in h s q est f r a s e ond @entity45 .

aft r  h   s afet , s att e kicked tw o fie @entity2   @entity36 hawn lynch  @entity8 n   1-yar  run for a t uch own. the se haw s'  smi h interce p d m nning'  sec nd inte cep ion and @entity1 e  it  @entity3 he @entity51 the sec nd  alf sta ed out  ad f r d n er, with  s @entity54 vin ru n ng   the o enin   ki @entity34  tou hdow .

j rm ine kears  caught a 2 -yeard  ass from  r @entity34 ilson wit  2:58  eft in  he thi d  @entity59 othe  touc down  w lso  threw   s se ond sc rin  stri e to do g bal d @entity1  11 45 l ft  n the four h.

 @entity8 re th   hi r d qu rter en @entity62 n @entity1 core oar   hen   manni @entity63 re  a 1 -ya d pas  t o   d @entity1   hom s. den er   went   for @entity8 o nt  at  @entity9 ak  the score 3 6 8.

denver, l e  by the 3 -ye r-old  anning , @entity66 s  of @entity3 atistic lly, b t n ver go @entity67 .

ri hard sh rma @entity69 e seattle de f nse playe  who m cked san f r nc sco  fter wi ning t e  fc  hampio ship, w a never really cha le @entity72 ft wit  an inju y i n t e fourth q art r.

balm  con iti ns o  gam e  @entity39   he g me @entity74 ien e the bla kout t at hit l a  yea 's s per  owl,  ans in  os  n eles h d t eir ow  vis al bla kout.
 some fan  lo t cable  ervi c  for a p @entity84 sse  pa @entity85 a @entity86 w, s ar i g bru o ma s a d the red hot chil  peppe s.  ime wa ner cabl e a d i  a  weet that th  issu  was r s @entity39  the  nd o  the game

 @entity1 a  shot  ever a  ti es  uri g a s per bow  pa ty a t a r sidenc  in denve @entity97  th  ma  was found outs de the home , but  ol ce s oke ma  so ny jacks o did not hav  addi io a  informa io , such a  what led   o t e shooting.
t e m n wa   aken to a  o pital i  critical c o iti n, and  a thori es do n t  ave @entity39 o  a sus ect des ipt on.

th  e per @entity39 istory    the super @entity3 the numbe s

 he se rets of t e  @entity1 ow  flyover

seattl  do @entity8 s adva tag  of denver's  s takes

@h hlig t

 eag e mvp @placeholder ing thr ws @entity3 er eptions

 highlig

this was he @entity3 per bo l v ctory in s attle'  f a @entity39

@entity39

@entity84:riod  nd m
@entity85:t o  th  seco d half
@entity86:d th  halft me sh
@entity3:first S
@entity2:d goals,
@entity1:uper B
@entity0:Steam olle : Seat
@entity9:o m
@entity8:inates and tak
@entity51: touc down.

@entity54:ttle's P rcy Ha
@entity74:did 't expe
@entity59:uart r for a
@entity72:ged. He
@entity17:defe se, Seattl
@entity16:, again t the b
@entity34:ssell
@entity36:d Mar
@entity39:chise hist
@entity18: bu  the d
@entity97:, pol ce said
@entity45:Super Bo l rin
@entity69:, t
@entity66:as the  FL's be
@entity67: mo ing Sunda
@entity62:ed, Denve  got o
@entity63:g t%

Note the significant spacing errors. My suspicion is that adding the title breaks AnonymizedStory, but I haven't experimented with a fix yet. Since I'm trying to do something Actually Useful based on the target dataset, I'll give this a few tries and let you know if I find a fix.


arubis commented Jan 27, 2017

Heh. Turns out to be an easy fix.
I haven't totally debugged it, but it's the Tokenize function that's choking, and it does so when you call Story.ToString(). Here's your current Story class:

class Story(namedtuple('StoryBase', 'url content highlights title')):

  def ToString(self):
    return self.title + '\n' + self.content + ''.join([
        '\n\n@highlight\n\n' + highlight
        for highlight in self.highlights])

If you drop the self.title + '\n' + and just return the story content without title, the tokenizer is right as rain. So instead do:

class Story(namedtuple('StoryBase', 'url content highlights title')):

  def ToString(self):
    return self.content + ''.join([
        '\n\n@highlight\n\n' + highlight
        for highlight in self.highlights])

...and have a look at your shiny new output.

I haven't trained on this set yet, but it sure looks a lot better (and hope it helps you out, too!).


arubis commented Jan 27, 2017

For all that, I just now realize you were probably using --mode=store all along.
In any case, the above fixes --mode=generate should you want to use its output :)
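
If you want to keep the title in the .story files (for --mode=store) while also keeping --mode=generate working, one untested option is to leave ToString() title-free, since the tokens/ and entities/ offset files were presumably computed against the original title-less output, and add a hypothetical ToStringWithTitle() helper used only when writing stories. A sketch:

class Story(namedtuple('StoryBase', 'url content highlights title')):

  def ToString(self):
    # Must match the text the token/entity offset files index into,
    # so no title here.
    return self.content + ''.join([
        '\n\n@highlight\n\n' + highlight
        for highlight in self.highlights])

  def ToStringWithTitle(self):
    # Only used when writing .story files, where the extra title line is fine.
    return self.title + '\n' + self.ToString()

WriteStory() would then call story.ToStringWithTitle() instead of story.ToString(). The garbled spacing in the sample above is consistent with every character offset being shifted by the length of the prepended title line.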
