"""JSON API Crawler in Python."""
import requests
import pypyodbc
import json
import time
import random
import os
##############
# Parameters #
# URL that is fetched with a GET request on every job.
url = "https://example.com"
loop_times = 200000  # Loop count setting for the main request loop.
sleep_time = False  # Sleep a short random time (0 to 0.3 s) after each request?
# Your database location, MS Access Database ONLY!
# Raw string: the backslashes are path separators. The previous literal relied
# on "\G", "\p", "\c" and "\h" being left alone, which are invalid escape
# sequences and trigger warnings on current Python versions.
db_location = r"c:\WorkDirectory\GitHub\py-wd\crawler_v1.0\hitokoto.accdb"
# Remove the old database file first? Make sure it contains nothing important!
remove_old_db = False
create_new_db = False  # Create a new database file at db_location?
table_name = "hitokoto_us"  # The table that is read from and written to.
create_new_table = False  # Create a new table (with the name above)?
is_first_time = False  # Is it the first time this table is written to?
##############
# Create table
def Create_Table():
    """Create the table named by ``table_name`` and commit.

    The column list matches what ``Write_DB`` inserts
    (id, hitokoto, source, catname, add_time). The previous schema had no
    ``catname`` column, so inserts into a freshly created table failed.

    Uses the module-level cursor ``cur``.
    """
    SQL = (
        'CREATE TABLE ' + table_name
        + ' (id int,hitokoto varchar(255),source varchar(255),'
        + 'catname varchar(255),add_time DateTime)'
    )
    # Execute and commit on the same cursor instead of a throwaway one.
    cur.execute(SQL)
    cur.commit()
def Request_Job(index):
    """Fetch one record from ``url`` and store it if it is not a duplicate.

    The response body is parsed as JSON and must provide the keys ``text``,
    ``source`` and ``catname``. The record is written through ``Write_DB``
    only when ``Check_Dup`` reports that its text is not in the table yet.

    Args:
        index: Value stored in the ``id`` column if the record is inserted.

    Raises:
        Whatever ``requests``, the JSON decoding, the key lookups or the
        database calls raise; nothing is caught here.
    """
    # A timeout keeps one stalled connection from hanging the whole crawl.
    response = requests.get(url, timeout=30)
    data = response.json()
    print('Writing id = ' + str(index))
    print(data['text'])
    print(data['source'])
    print(data['catname'])
    # Check_Dup splices its argument into the SQL text verbatim, so hand it a
    # complete string literal: wrap the text in single quotes and double every
    # embedded single quote (the SQL escape for a quote inside a literal).
    # str.replace returns a new string, so its result has to be used; the
    # earlier code discarded it and left quotes unescaped.
    hitokoto_query = "'" + data['text'].replace("'", "''") + "'"
    is_new = Check_Dup(hitokoto_query)
    if is_new:
        Write_DB(data, index)
    # Optional pause of 0, 0.1, 0.2 or 0.3 seconds between requests.
    if sleep_time:
        timer = random.randint(0, 3)
        time.sleep(timer / 10)
# Insert Data
def Write_DB(data,index):
    """Insert one fetched record into ``table_name`` and commit.

    Args:
        data: Mapping providing the keys ``text``, ``source`` and ``catname``.
        index: Value stored in the ``id`` column.
    """
    # With no time tuple, strftime formats the current local time.
    today = time.strftime("%Y-%m-%d")
    statement = (
        'INSERT INTO ' + table_name
        + '(id,hitokoto,source,catname,add_time) VALUES(?,?,?,?,?)'
    )
    row_values = (index, data['text'], data['source'], data['catname'], today)
    # Values travel as query parameters; only the table name is concatenated.
    cur.execute(statement, row_values)
    cur.commit()
# Check duplicate or not
def Check_Dup(check_hitokoto):
    """Tell whether no row with the given ``hitokoto`` value exists yet.

    Args:
        check_hitokoto: Text appended verbatim after ``WHERE `hitokoto` = ``;
            the caller must pass a complete, already quoted SQL literal.

    Returns:
        True when the lookup finds no row, False when one is found.
    """
    lookup = 'SELECT * FROM ' + table_name + ' WHERE `hitokoto` = ' + check_hitokoto
    cur.execute(lookup)
    # Guard clause: no matching row means the entry is new.
    if not cur.fetchone():
        print ('This is New!')
        return True
    print ('Nothing Speciall!')
    return False
##############
# Main Start #
##############
# Remove the old accdb file
if remove_old_db:
    os.remove(db_location)
# Create a new database
if create_new_db:
    pypyodbc.win_create_mdb(db_location)
# Connect to the accdb. win_connect_mdb expects the path of the database file
# and builds the ODBC connection string itself, so it gets the plain path
# rather than a hand-made "Driver=...;DBQ=..." string.
conn = pypyodbc.win_connect_mdb(db_location)
# Create a cursor
cur = conn.cursor()
try:
    # Create a table
    if create_new_table:
        Create_Table()
    # Optional seed write with id 1. The loop below also copes with an empty
    # table, so this flag is kept only for existing configurations.
    if is_first_time:
        Request_Job(1)
    # Requests start
    print('Begin!')
    # range's upper bound is exclusive: +1 so that exactly loop_times jobs run.
    for x in range(1, loop_times + 1):
        try:
            SQL_max_id = 'SELECT MAX(id) FROM ' + table_name
            cur.execute(SQL_max_id)
            max_id = cur.fetchone()[0]
            # MAX() is NULL (None) while the table is empty; start at id 1.
            Request_Job((max_id or 0) + 1)
            print("Info: Job Finished!")
        except Exception as exc:
            # Keep crawling after a failed job, but say what went wrong.
            # Catching Exception (not a bare except) lets Ctrl+C stop the run.
            print("Error: Operating Failed!")
            print(repr(exc))
        else:
            print("Succeed")
        # 50-character progress bar.
        times = round(x * (50 / loop_times))
        print(u"\u2588" * times + u"\u2592" * (50 - times))
    print('All Done!')
finally:
    # Release the database handles even when the run is interrupted.
    cur.close()
    conn.close()
# End of script.