Skip to content

Instantly share code, notes, and snippets.

@acviana
Created July 21, 2014 06:33
Show Gist options
  • Save acviana/2bb43adcdfdba69593c5 to your computer and use it in GitHub Desktop.
Save acviana/2bb43adcdfdba69593c5 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
import requests
import json
# Seconds to wait on a merchant server before giving up, so that a single
# unresponsive host cannot stall the whole run.
REQUEST_TIMEOUT_SECONDS = 30


def extract_merchant_url(record):
    """Return the merchant script URL embedded in one driver-en.js line.

    The URL is taken from the second-to-last whitespace-separated token,
    with its final character dropped (presumably a trailing ``;`` -- verify
    against the actual driver file) and surrounding double quotes removed.

    Raises IndexError if the line has fewer than two tokens.
    """
    return record.split()[-2][:-1].strip('"')


def extract_regex(record):
    """Return the ``RegExp(...`` fragment of a driver-en.js line.

    The fragment runs from the first ``RegExp(`` up to, but not including,
    the first ``) {``. The ``RegExp(`` prefix is kept so the report format
    stays unchanged. An empty string is returned when either marker is
    missing, instead of slicing with the -1 that ``str.find`` reports.
    """
    regex_start = record.find('RegExp(')
    regex_end = record.find(') {')
    if regex_start == -1 or regex_end == -1:
        return ''
    return record[regex_start:regex_end]


def parse_merchant_js(text):
    """Decode the JSON object assigned to ``site_json`` in a merchant file.

    Leading/trailing whitespace, the ``site_json = `` prefix and any
    trailing ``;`` are removed before decoding, so the result does not
    depend on how the file happens to end (``;\\n``, ``;\\r\\n`` or ``;``).

    Raises ValueError if the remaining text is not valid JSON.
    """
    prefix = 'site_json = '
    payload = text.strip()
    if payload.startswith(prefix):
        payload = payload[len(prefix):]
    return json.loads(payload.rstrip(';'))


def build_record(record, final_url, merchant_data):
    """Assemble the report fields for one merchant.

    record        -- the driver-en.js line the merchant came from
    final_url     -- URL the merchant file was served from; its last path
                     segment, minus ``.js``, becomes the merchant id
    merchant_data -- decoded merchant JSON; must provide the ``deals``,
                     ``88x31`` and ``name`` keys (KeyError otherwise)
    """
    # Non-ASCII characters are dropped from the name so it can always be
    # printed and written. Decoding back to text keeps Python 3 from
    # rendering the value as "b'...'" in the report.
    merchant_name = merchant_data['name'].encode('ascii', 'ignore').decode('ascii')
    return {
        'merchant_id': final_url.split('/')[-1].replace('.js', ''),
        'deal_count': len(merchant_data['deals']),
        'image': merchant_data['88x31'],
        'merchant_name': merchant_name,
        'regex': extract_regex(record),
    }


def format_line(record):
    """Render one record dict as a line of the output report."""
    return '{merchant_name}, {regex}, {merchant_id}, {deal_count}, {image} \n'.format(**record)


def main():
    """Scrape every merchant referenced in driver-en.js into a text report.

    Reads ``driver-en.js`` from the current directory, downloads the
    merchant file referenced by each line containing ``if``, and writes one
    line per merchant to ``sgt-webscrapping-output.txt``.
    """
    with open('driver-en.js', 'r') as driver_file:
        driver_lines = driver_file.readlines()

    output_list = []
    for counter, record in enumerate(driver_lines):
        # NOTE(review): this is a plain substring test, so any line that
        # merely contains "if" is treated as a merchant entry -- confirm
        # this matches the layout of driver-en.js.
        if 'if' in record:
            response = requests.get(extract_merchant_url(record),
                                    timeout=REQUEST_TIMEOUT_SECONDS)
            # Fail with a clear HTTP error rather than an opaque JSON
            # decoding error when the server returns an error page.
            response.raise_for_status()
            merchant_data = parse_merchant_js(response.text)
            record_dict = build_record(record, response.url, merchant_data)
            print(record_dict['regex'])
            output_list.append(record_dict)
        # Progress indicator: report every 50th line of the driver file.
        if counter % 50 == 0:
            print(counter)

    with open('sgt-webscrapping-output.txt', 'w') as report:
        report.writelines(format_line(entry) for entry in output_list)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment