Created
November 6, 2013 16:27
-
-
Save danielfrg/7339270 to your computer and use it in GitHub Desktop.
Parse Nutch segments into a TDF (tab-delimited file). Light on memory: only one line is loaded at a time and only one HTML document is held at a time; on the other hand, it is more I/O intensive. `_input` is the dumped HTML content from Nutch;
`_output` is the TDF that is generated. Requirements:
pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
# Path of the Nutch segment dump that the script reads.
_input = 'dump0'
# Path of the tab-separated output file that the script (re)creates.
_output = 'html0.tdf'

# Start the output file afresh with only the header row (no index column);
# the data rows are appended to this file one record at a time later on.
df = pd.DataFrame({'url': [], 'html': []})
df.to_csv(_output, sep='\t', index=False)
def append_tdf(urls, html, output=None):
    """Append one (url, html) record as a single row of the output TDF.

    The row is written tab-separated, in append mode, without a header
    line and without an index column, so existing content of the file is
    left untouched.

    Args:
        urls: URL of the page. Despite the plural name, it is stored as
            one value in the ``url`` column.
        html: HTML content of the page, stored in the ``html`` column.
        output: Path of the file to append to. When omitted, the
            module-level ``_output`` path is used.
    """
    if output is None:
        output = _output
    row = pd.DataFrame({'url': [urls], 'html': [html]})
    row.to_csv(output, mode='a', sep='\t', index=False, header=False)
def _iter_records(lines):
    """Yield ``(url, html)`` pairs parsed from the lines of a Nutch dump.

    The parser is a three-state machine driven by marker lines:

    * outside a record: everything is skipped until a ``Content::`` line;
    * in the record header: a ``url: `` line sets the URL and a
      ``Content:`` line switches to the HTML body, other lines are skipped;
    * in the HTML body: lines are accumulated until a line starting with
      ``Recno:: `` or ``CrawlDatum::`` (record ends) or a ``Content::``
      line (record ends and the next one starts).

    Only the current line and the current record's HTML are kept in memory.

    Args:
        lines: Iterable of text lines, e.g. an open file object.

    Yields:
        Tuples ``(url, html)``; ``url`` is ``''`` when the record had no
        ``url: `` line.
    """
    url_prefix = 'url: '
    end_markers = ('Recno:: ', 'CrawlDatum::')

    url = ''
    html_parts = []
    in_record = False
    in_html = False

    for raw in lines:
        line = raw.rstrip('\n')

        if not in_record:
            # Skip everything until the start of a content record.
            in_record = line == 'Content::'
            continue

        if not in_html:
            if line.startswith(url_prefix):
                # Slice off the prefix instead of splitting on it, so a URL
                # that itself contains 'url: ' is not truncated.
                url = line[len(url_prefix):]
            elif line == 'Content:':
                in_html = True
            continue

        starts_next = line == 'Content::'
        if starts_next or line.startswith(end_markers):
            yield url, ''.join(html_parts)
            # A 'Content::' line opens the next record straight away; the
            # other markers drop back to scanning for one.
            in_record = starts_next
            in_html = False
            url = ''
            html_parts = []
        else:
            # Lines are joined without a separator, so each record stays
            # on a single line of the output.
            html_parts.append(line)

    # Flush the record that was still open at end of input, but only when
    # some HTML was actually collected for it.
    html = ''.join(html_parts)
    if html != '':
        yield url, html


# NOTE(review): the dump is opened with the platform default encoding --
# confirm this matches the encoding Nutch used when writing the dump.
with open(_input) as f:
    for url, html in _iter_records(f):
        append_tdf(url, html)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment