# acviana/webscrapping.py

## webscrapping.py
#!/usr/bin/env python

import requests
import json

with open('driver-en.js', 'r') as f:
    data = f.readlines()

output_list =[]
for counter, record in enumerate(data):
    if 'if' in record:

        # Use the requests library to retrieve the merchant js file
        # and the json library to load it into a dictionary
        r = requests.get(record.split()[-2][:-1].strip('"'))
        merchant_data = r.text.replace('site_json = ','')[:-2]
        merchant_data = json.loads(merchant_data)

        # Parse out the rest of the data of interest
        record_dict = {}
        record_dict['merchant_id'] = r.url.split('/')[-1].replace('.js','')
        record_dict['deal_count'] = len(merchant_data['deals'])
        record_dict['image'] = merchant_data['88x31']

        # Cheat a little and throw out the non-ascii characters so we
        # can print everything later
        record_dict['merchant_name'] = merchant_data['name'].encode('ascii',errors='ignore')

        # Parse out the Regex field from the driver-en.js file.
        regex_start = record.find('RegExp(')
        regex_end = record.find(') {')
        record_dict['regex'] =  record[regex_start:regex_end]
        print record_dict['regex']

        output_list.append(record_dict)

    if counter % 50 == 0:
        print counter

with open('sgt-webscrapping-output.txt', 'w') as f:
    for record in output_list:
        line = '{merchant_name}, {regex}, {merchant_id}, {deal_count}, {image} \n'.format(**record)
        f.write(line)
	#!/usr/bin/env python

# Scrape merchant metadata for every rule line in driver-en.js and write one
# comma-separated line per merchant to sgt-webscrapping-output.txt.
#
# The block structure below was rebuilt from text whose indentation had been
# flattened to a single tab per line.

import requests
import json

with open('driver-en.js', 'r') as f:
    data = f.readlines()

output_list = []
for counter, record in enumerate(data):
    if 'if' in record:
        try:
            # Use the requests library to retrieve the merchant js file.
            # The timeout keeps one unresponsive host from hanging the run,
            # and raise_for_status() turns an HTTP error page into an
            # explicit failure instead of a confusing JSON error.
            r = requests.get(record.split()[-2][:-1].strip('"'), timeout=30)
            r.raise_for_status()

            # Drop the assignment prefix and decode the first complete JSON
            # value; raw_decode ignores whatever trails it (";", newline).
            payload = r.text.replace('site_json = ', '').lstrip()
            merchant_data = json.JSONDecoder().raw_decode(payload)[0]

            # Parse out the rest of the data of interest
            record_dict = {}
            record_dict['merchant_id'] = r.url.split('/')[-1].replace('.js', '')
            record_dict['deal_count'] = len(merchant_data['deals'])
            record_dict['image'] = merchant_data['88x31']

            # Cheat a little and throw out the non-ascii characters so we
            # can print everything later. Decode back to text so Python 3
            # does not write the name as "b'...'".
            record_dict['merchant_name'] = (
                merchant_data['name'].encode('ascii', 'ignore').decode('ascii'))

            # Parse out the Regex field from the driver-en.js file. The
            # slice keeps the leading 'RegExp(' marker.
            regex_start = record.find('RegExp(')
            regex_end = record.find(') {')
            record_dict['regex'] = record[regex_start:regex_end]
        except (requests.exceptions.RequestException,
                ValueError, KeyError, IndexError) as exc:
            # Report and skip: one bad merchant should not abort the run
            # before any output has been written.
            print('Skipping {0!r}: {1}'.format(record.strip(), exc))
        else:
            print(record_dict['regex'])
            output_list.append(record_dict)

    # Progress marker every 50 input lines.
    if counter % 50 == 0:
        print(counter)

with open('sgt-webscrapping-output.txt', 'w') as f:
    for record in output_list:
        line = '{merchant_name}, {regex}, {merchant_id}, {deal_count}, {image} \n'.format(**record)
        f.write(line)