groupby file scraper
from itertools import groupby
import csv

def parse(self, response):
    with open(self.csv_file) as f:
        rows = csv.DictReader(f)
        # groupby only merges *consecutive* rows with equal keys, so this
        # assumes the CSV is already ordered by 'Source Url'
        for source, group in groupby(rows, lambda r: r.get('Source Url')):
            meta = {
                'source_url': source,
                'source_anchor': "",
                'depth': 1,
                'hops_from_seed': 1,
            }
            # one synthetic page item per source, carrying all of that
            # source's document URLs (each paired with meta) in file_urls
            yield PageItem(
                url=source,
                content=b"",
                headers={},
                status=200,
                source_url="",
                source_anchor="",
                depth=1,
                hops_from_seed=1,
                file_urls=[(r.get('Document Url'), meta) for r in group],
            )
## I'm not really sure what this is doing differently than above, other than depth=0?
# if file_urls and last_source:
#     yield PageItem(
#         url=last_source,
#         content=b"",
#         headers={},
#         status=200,
#         source_url="",
#         source_anchor="",
#         depth=0,
#         hops_from_seed=0,
#         file_urls=file_urls,
#     )
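# One subtlety worth noting: itertools.groupby only groups *adjacent* rows,
# so if the CSV is not already ordered by 'Source Url' the loop above will
# yield a separate PageItem for each run of rows rather than one per source.
# Below is a minimal sketch of the sort-first variant, using the same column
# names as the reader above; 'sources.csv' is a placeholder path, and
# sorted() reads the whole file into memory, which is fine for modest CSVs.

import csv
from itertools import groupby

with open('sources.csv') as f:  # placeholder path, not from the gist
    # sort by 'Source Url' so groupby sees each source as one contiguous run;
    # 'or ""' guards against rows where the column is missing
    rows = sorted(csv.DictReader(f), key=lambda r: r.get('Source Url') or '')
    for source, group in groupby(rows, lambda r: r.get('Source Url')):
        print(source, [r.get('Document Url') for r in group])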