@benagricola
Created June 22, 2017 16:51
One-file, redistributable Scrapy-based crawler, packaged with PyInstaller. Generate the binary with: pyinstaller scrape.spec
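
The gist contains four pieces: two PyInstaller hooks, the crawler entry point, and the spec file. A sketch of the assumed layout (the file names under hooks/ are assumptions, but PyInstaller requires hooks to be named hook-<package>.py, and scrape.spec sets hookspath=['hooks']):

/opt/cot-scraper/
    scrape.py           # crawler entry point
    scrape.spec         # PyInstaller spec; build with: pyinstaller scrape.spec
    hooks/
        hook-cot.py     # hook for the 'cot' Scrapy project
        hook-scrapy.py  # hook for scrapy itself
    cot/                # the Scrapy project package

Building with pyinstaller scrape.spec writes the single-file binary to dist/scrape.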
# hooks/hook-cot.py -- assumed filename; PyInstaller hooks must be named hook-<package>.py
from PyInstaller.utils.hooks import collect_submodules

# This hook bundles all submodules of the Scrapy project 'cot';
# change the name to match your own Scrapy project.
hiddenimports = collect_submodules('cot')
# hooks/hook-scrapy.py -- assumed filename, as above
from PyInstaller.utils.hooks import collect_submodules, collect_data_files

# This collects all dynamically imported Scrapy modules and data files.
hiddenimports = (collect_submodules('scrapy') +
                 collect_submodules('scrapy.pipelines') +
                 collect_submodules('scrapy.extensions') +
                 collect_submodules('scrapy.utils') +
                 collect_submodules('cot'))

datas = collect_data_files('scrapy')
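
To sanity-check what the hooks will bundle before building, the collector helpers can be run directly; a minimal sketch, assuming PyInstaller is installed in the build environment:

# Run from the project root to preview what the hooks collect.
from PyInstaller.utils.hooks import collect_submodules, collect_data_files

print(len(collect_submodules('scrapy')))   # count of scrapy submodules to be bundled
for src, dest in collect_data_files('scrapy')[:5]:
    print(src, '->', dest)                 # first few data files and their bundle paths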
#!/usr/bin/env python
# scrape.py -- crawler entry point
import json
import sys
import argparse

from scrapy.settings import Settings
from scrapy.crawler import CrawlerProcess

from cot.spiders.cot import CotSpider
from cot.utils import CotEncoder


def main():
    parser = argparse.ArgumentParser(description='Website Crawler')
    parser.add_argument('--domain', type=str, choices=['test.com'], required=True,
                        help='Sets domain to crawl')
    parser.add_argument('--mode', type=str, choices=['incremental', 'full'], nargs='?',
                        default='incremental',
                        help='Sets run mode. Incremental must be run on an existing directory')
    parser.add_argument('--output', type=str, choices=['offline', 'public'], nargs='?',
                        default='public', help='Sets output type, either public or offline')
    parser.add_argument('--print', action='store_true', dest='do_print',
                        help='Prints resolved crawler settings and exits')
    # Split on the first '=' only, so values may themselves contain '='.
    parser.add_argument('--arg', type=lambda kv: kv.split('=', 1), dest='args', nargs='?',
                        action='append',
                        help='Specify multiple times with key=value arguments to override settings')
    args = parser.parse_args()

    settings_obj = Settings()

    if args.output == 'offline':
        from cot import offline_settings as settings
    else:
        from cot import public_settings as settings

    settings_obj.setmodule(settings, priority='project')

    if args.args:
        settings_obj.setdict(dict(args.args), priority='cmdline')

    # Force the CRAWL_TYPE and DOMAIN settings
    settings_obj.set('CRAWL_TYPE', args.mode, priority=settings_obj.maxpriority())
    settings_obj.set('DOMAIN', args.domain, priority=settings_obj.maxpriority())

    if args.do_print:
        try:
            print(json.dumps(dict(settings_obj), indent=4, separators=(',', ': '),
                             sort_keys=True, cls=CotEncoder))
        except Exception as e:
            print('Unable to print resolved settings: %s' % str(e))
            sys.exit(1)
        return

    process = CrawlerProcess(settings=settings_obj)
    process.crawl(CotSpider)
    process.start()


if __name__ == "__main__":
    main()
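
Once built, the binary accepts the same flags as the script. Example invocations (the settings passed via --arg are ordinary Scrapy settings, shown here for illustration):

./dist/scrape --domain test.com --print
./dist/scrape --domain test.com --mode full --output offline --arg LOG_LEVEL=INFO --arg CONCURRENT_REQUESTS=8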
# scrape.spec -- PyInstaller spec file; build with: pyinstaller scrape.spec
block_cipher = None
options = []

a = Analysis(['scrape.py'],
             pathex=['/opt/cot-scraper'],
             binaries=[],
             datas=[],
             hiddenimports=[],
             hookspath=['hooks'],
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)

pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)

# Passing a.binaries / a.zipfiles / a.datas straight to EXE, with no COLLECT
# step, makes PyInstaller emit a single self-contained executable.
exe = EXE(pyz,
          a.scripts,
          a.binaries,
          a.zipfiles,
          a.datas,
          options,
          exclude_binaries=False,
          name='scrape',
          debug=False,
          strip=False,
          upx=True,
          console=True)
@sam5-hub
Regarding the line 'from cot import offline_settings as settings': I cannot find offline_settings in the cot library. Please tell me how I can find it. Thanks.
