#!/usr/bin/env python
"""
Downloads and cleans up a CSV file from a Google Trends query.

Usage:

    trends.py google.username@gmail.com google.password /path/to/filename query1 [query2 ...]

Requires mechanize:

    pip install mechanize
"""
import cookielib
import csv
import mechanize
import re
from StringIO import StringIO
import sys


def main(argv):
    # Google login credentials
    username = argv[1]
    password = argv[2]
    # Where to save the CSV file
    pathname = argv[3]
    queries = ('q=' + query for query in argv[4:])

    br = mechanize.Browser()

    # Create cookie jar
    cj = cookielib.LWPCookieJar()
    br.set_cookiejar(cj)

    # Act like we're a real browser
    br.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

    # Log in to Google
    response = br.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
    forms = mechanize.ParseResponse(response)
    form = forms[0]
    form['Email'] = username
    form['Passwd'] = password
    response = br.open(form.click())

    # Get CSV from Google Trends
    trends_url = 'http://www.google.com/trends/trendsReport?'
    query_params = '&'.join(queries)
    response = br.open(trends_url + query_params + '&export=1')

    # Remove headers and footers from Google's CSV.
    # Use the last date in each date range.
    reader = csv.reader(StringIO(response.read()))
    dates = []
    values = []
    for row in reader:
        try:
            date, value = row
        except ValueError:
            continue
        if re.search('[0-9]{4}-[0-9]{2}-[0-9]{2}', date):
            dates.append(date[-10:])  # Uses last date in time period
            values.append(value)

    with open(pathname, 'w') as f:
        writer = csv.writer(f)
        writer.writerow(['date', 'debt'])
        for row in zip(dates, values):
            writer.writerow(row)


if __name__ == '__main__':
    sys.exit(main(sys.argv))
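For reference, here is a small, hypothetical example (the query term is a placeholder) of how the script assembles the export URL from its command-line arguments; note that the script does not URL-encode the query terms:

argv = ['trends.py', 'user@gmail.com', 'secret', '/tmp/trends.csv', 'debt']
queries = ('q=' + q for q in argv[4:])
print('http://www.google.com/trends/trendsReport?' + '&'.join(queries) + '&export=1')
# http://www.google.com/trends/trendsReport?q=debt&export=1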
Mechanize follows robots.txt by default, but the check can be disabled, assuming you have permission or have thought the ethics through.
Insert br.set_handle_robots(False) to avoid:
httperror_seek_wrapper: HTTP Error 403: request disallowed by robots.txt
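A minimal sketch of where the call goes, right after the Browser is created inside main (same indentation as the surrounding lines):

br = mechanize.Browser()
br.set_handle_robots(False)  # skip the robots.txt check; only do this with permission
# Create cookie jar
cj = cookielib.LWPCookieJar()
br.set_cookiejar(cj)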
The script runs fine on my Linux box, at least when run as Tom instructed.
This script is not working in Python 3.5.1. Can you help upgrade it to work on the latest version?
@supermaxim I have installed Python 2.7.11, but after that I am still getting the same "HTTP Error 403: request disallowed by robots.txt" error. I added "br.set_handle_robots(False)", but that gives me another error, "unexpected indent" - where do I add this line?
Finally I am able to run the file now, but it is not giving any data... I am getting a CSV file with just the two column headers, "date" and "debt", and nothing else.
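One way to check what is going on (a hedged debugging sketch, not part of the original script) is to dump the raw Trends response before the CSV parsing step; if Google returned a login or error page instead of CSV data, the script will only ever write the header row:

raw = response.read()
print(raw[:500])  # HTML at the start of the payload means the export request failed
reader = csv.reader(StringIO(raw))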
I am using Python 2.7.10 and mechanize 0.2.5, but got the error below:
Traceback (most recent call last):
  File "./trends.py", line 71, in <module>
    sys.exit(main(sys.argv))
  File "./trends.py", line 42, in main
    form['Passwd'] = password
  File "/Library/Python/2.7/site-packages/mechanize/_form.py", line 2780, in __setitem__
    control = self.find_control(name)
  File "/Library/Python/2.7/site-packages/mechanize/_form.py", line 3101, in find_control
    return self._find_control(name, type, kind, id, label, predicate, nr)
  File "/Library/Python/2.7/site-packages/mechanize/_form.py", line 3185, in _find_control
    raise ControlNotFoundError("no control matching "+description)
mechanize._form.ControlNotFoundError: no control matching name 'Passwd'
Changed to a two-step login and it's working now:

# Log in to Google
response = br.open('https://accounts.google.com/ServiceLogin?hl=en&continue=https://www.google.com/')
forms = mechanize.ParseResponse(response)
form = forms[0]
form['Email'] = username
response = br.open(form.click())
forms = mechanize.ParseResponse(response)
form = forms[0]
form['Passwd'] = password
response = br.open(form.click())
Hi Adam,
Try without the comma and let me know if that works.