groupby file scraper
from itertools import groupby
import csv

def parse(self, response):
    with open(self.csv_file) as f:
        rows = csv.DictReader(f)
        # groupby only merges *consecutive* rows with equal keys, so this
        # assumes the CSV is already ordered by 'Source Url'
        for source, group in groupby(rows, lambda r: r.get('Source Url')):
            meta = {
                'source_url': source,
                'source_anchor': "",
                'depth': 1,
                'hops_from_seed': 1,
            }
            # one synthetic page item per source, carrying all of that
            # source's document URLs (each paired with meta) in file_urls
            yield PageItem(
                url=source,
                content=b"",
                headers={},
                status=200,
                source_url="",
                source_anchor="",
                depth=1,
                hops_from_seed=1,
                file_urls=[(r.get('Document Url'), meta) for r in group],
            )
## I'm not really sure what this is doing differently than above, other than depth=0?
# if file_urls and last_source:
#     yield PageItem(
#         url=last_source,
#         content=b"",
#         headers={},
#         status=200,
#         source_url="",
#         source_anchor="",
#         depth=0,
#         hops_from_seed=0,
#         file_urls=file_urls,
#     )
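# One subtlety worth noting: itertools.groupby only groups *adjacent* rows,
# so if the CSV is not already ordered by 'Source Url' the loop above will
# yield a separate PageItem for each run of rows rather than one per source.
# Below is a minimal sketch of the sort-first variant, using the same column
# names as the reader above; 'sources.csv' is a placeholder path, and
# sorted() reads the whole file into memory, which is fine for modest CSVs.

import csv
from itertools import groupby

with open('sources.csv') as f:  # placeholder path, not from the gist
    # sort by 'Source Url' so groupby sees each source as one contiguous run;
    # 'or ""' guards against rows where the column is missing
    rows = sorted(csv.DictReader(f), key=lambda r: r.get('Source Url') or '')
    for source, group in groupby(rows, lambda r: r.get('Source Url')):
        print(source, [r.get('Document Url') for r in group])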