Skip to content

Instantly share code, notes, and snippets.

@tokoroten
Created March 16, 2017 22:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tokoroten/dfaed27e6897ef2e3de0511640d52a7c to your computer and use it in GitHub Desktop.
Save tokoroten/dfaed27e6897ef2e3de0511640d52a7c to your computer and use it in GitHub Desktop.
Kickstarter Crawler
#coding: utf-8
import datetime
import json
import os
import time
import urllib.parse
import urllib.request
# Make sure the output directory exists before any project file is written.
# exist_ok=True tolerates an already-existing directory, while real problems
# (no permission, "result" being a regular file) still raise instead of being
# silently swallowed.
os.makedirs("result", exist_ok=True)
# Crawler settings: the "sort" and "term" values placed in the search URL,
# and the category ids that are crawled one after another.
sort_key = "newest"
search_term = ""
category_list = [  # technology category
    16,
    331, 332, 333, 334, 335, 336, 337,
    52,
    362,
    338,
    51,
    339, 340, 341, 342,
]
# Crawl each category page by page and save every project found as
# result/<project id>.json. Pages are requested from 1 up to 200; a category
# ends early on a failed request or on a page without projects.
for category_id in category_list:
    project_count = 0
    for page_id in range(1, 201):
        # urlencode escapes the values, so a search term containing spaces,
        # "&" or non-ASCII characters still yields a valid URL.
        query = "http://www.kickstarter.com/projects/search.json?" + urllib.parse.urlencode([
            ("term", search_term),
            ("category_id", category_id),
            ("page", page_id),
            ("sort", sort_key),
        ])
        try:
            # The timeout keeps one stalled connection from hanging the crawl;
            # the "with" block closes the HTTP response even on errors.
            with urllib.request.urlopen(query, timeout=30) as response:
                response_json = json.loads(response.read())
        except Exception as exc:
            # Best effort, as before: give up on this category and move on to
            # the next one, but report why. Unlike a bare "except:", this lets
            # Ctrl-C (KeyboardInterrupt) stop the script.
            print(category_id, "request failed:", exc)
            break

        projects = response_json["projects"]
        if not projects:
            break

        project_count += len(projects)
        total_hits = response_json["total_hits"]
        # Guard against a zero total so the percentage cannot divide by zero.
        percent = round(project_count / total_hits * 100, 2) if total_hits else 0.0
        print(category_id, "progress", project_count, "/", total_hits, percent, "%")

        for project in projects:
            filepath = "result/%d.json" % project["id"]
            # "with" guarantees the file is closed even if the write fails.
            with open(filepath, "w") as fp:
                fp.write(json.dumps(project, sort_keys=True, indent=2))

        # Pause one second between page requests.
        # NOTE(review): the original indentation was lost; the pause is
        # assumed to happen once per page rather than once per project.
        time.sleep(1)
# coding:utf-8
import os
import glob
import json
import datetime
import csv
# Column headers, in the same order as the values built by project_to_row().
CSV_COLUMNS = [
    "category",
    "pledged", "goal",
    "currency",
    "backers_count",
    "state",
    "name",
    "launched_at",
    "deadline",
    "blurb",
    "url",
    "creator_id",
    "country",
]


def project_to_row(project):
    """Return the CSV row for one project dict as a list of strings.

    The timestamps in ``launched_at`` and ``deadline`` are rendered with
    ``datetime.datetime.fromtimestamp`` (local time). Newlines, carriage
    returns and commas in the blurb are replaced by spaces.

    Raises:
        KeyError: if the project dict lacks one of the fields read here.
    """
    blurb = project["blurb"].replace("\n", " ").replace("\r", " ").replace(",", " ")
    items = [
        project["category"]["slug"],
        project["pledged"],
        project["goal"],
        project["currency"],
        project["backers_count"],
        project["state"],
        project["name"],
        str(datetime.datetime.fromtimestamp(project["launched_at"])),
        str(datetime.datetime.fromtimestamp(project["deadline"])),
        blurb,
        project["urls"]["web"]["project"],
        project["creator"]["id"],
        project["country"],
    ]
    # str() renders non-string values (numbers, None) the way the original
    # loop did; strings pass through unchanged.
    return [str(item) for item in items]


def write_csv(pattern="result/*.json", out_path="kickstarter_result.csv"):
    """Write one CSV row per JSON file matching *pattern* to *out_path*.

    The output is Shift-JIS encoded text; characters that cannot be encoded
    are dropped (``errors="ignore"``), matching the original per-item
    ``encode("shift-jis", "ignore")``. Each processed filename is printed.
    """
    # Python 3's csv module needs a text-mode file opened with newline="";
    # the original "wb" mode makes csv.writer raise TypeError there.
    with open(out_path, "w", encoding="shift_jis", errors="ignore", newline="") as fp:
        cw = csv.writer(fp, lineterminator="\n")
        cw.writerow(CSV_COLUMNS)
        for filename in glob.glob(pattern):
            # Close each input file promptly instead of leaking the handle.
            with open(filename, encoding="utf-8") as src:
                project = json.load(src)
            cw.writerow(project_to_row(project))
            print(filename)


# Script behaviour: convert everything under result/ right away.
write_csv()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment