Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@mashirozx
Last active June 12, 2019 05:46
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save mashirozx/a8accbc6db88da06bee78346fc7e01b8 to your computer and use it in GitHub Desktop.
Save mashirozx/a8accbc6db88da06bee78346fc7e01b8 to your computer and use it in GitHub Desktop.
JSON API Crawler in Python
import requests
import pypyodbc
import json
import time
import random
import os
##############
# Parameters #
url = "https://example.com"  # JSON API endpoint requested on every loop iteration
loop_times = 200000 # Set times of loops
sleep_time = False # Do you need a sleep between two requests?
# Your database location, MS Access Database ONLY!
# Written as a raw string so the Windows backslashes are taken literally;
# the previous non-raw literal relied on "\G", "\p", "\c" and "\h" being
# unrecognised escape sequences, which newer Python versions warn about.
db_location = r"c:\WorkDirectory\GitHub\py-wd\crawler_v1.0\hitokoto.accdb"
remove_old_db = False # Do you want to remove the old database file? Make sure that contains nothing important!
create_new_db = False # Do you want to create a new database?
table_name = "hitokoto_us" # The table you are going to write!
create_new_table = False # Do you want to create a new table (with the name above)?
is_first_time = False # Is it the first time to write in this table?
##############
# Create table
def Create_Table():
    """Create the table named by ``table_name`` and commit.

    The schema contains every column that Write_DB inserts into
    (id, hitokoto, source, catname, add_time). The ``catname`` column was
    previously missing, so inserts into a freshly created table failed.
    """
    SQL = ('CREATE TABLE ' + table_name +
           ' (id int,hitokoto varchar(255),source varchar(255),'
           'catname varchar(255),add_time DateTime)')
    # Run on the shared module-level cursor (the same one that commits)
    # instead of opening an extra cursor that is never closed.
    cur.execute(SQL)
    cur.commit()
def Request_Job(index):
    """Fetch one entry from ``url`` and store it under *index* if it is new.

    The JSON response is expected to provide the keys ``text``, ``source``
    and ``catname`` (all three are read below). The entry is written with
    Write_DB only when Check_Dup reports that its text is not stored yet.
    When ``sleep_time`` is truthy the function then pauses for 0 to 0.3 s.

    Args:
        index: id to store the new row under.
    """
    response = requests.get(url)
    #print (response.text)
    data = response.json()
    print ('Writing id = ' + str(index))
    print (data['text'])
    print (data['source'])
    print (data['catname'])
    # Write
    # Check_Dup splices its argument verbatim into a WHERE clause, so it must
    # receive a complete SQL string literal. Embedded single quotes are
    # escaped by doubling them. str.replace returns a new string, so its
    # result has to be used: the earlier code discarded it, which left quotes
    # unescaped and broke the query for any text containing "'".
    hitokoto_query = "'" + data['text'].replace("'", "''") + "'"
    is_new = Check_Dup(hitokoto_query)
    if is_new:
        Write_DB(data, index)
    # Time sleep
    if sleep_time:
        timer = random.randint(0, 3)
        time.sleep(timer / 10)
# Insert Data
def Write_DB(data,index):
    """Insert one row into ``table_name`` and commit it.

    The row consists of *index*, the ``text``, ``source`` and ``catname``
    entries of *data*, and today's local date formatted as YYYY-MM-DD.
    Values are passed as query parameters rather than spliced into the SQL.
    """
    today = time.strftime("%Y-%m-%d")  # defaults to the current local time
    statement = 'INSERT INTO {}(id,hitokoto,source,catname,add_time) VALUES(?,?,?,?,?)'.format(table_name)
    row_values = (
        index,
        data['text'],
        data['source'],
        data['catname'],
        today,
    )
    cur.execute(statement, row_values)
    cur.commit()
# Check duplicate or not
def Check_Dup(check_hitokoto):
    """Return True when no stored row has the given hitokoto text.

    *check_hitokoto* is appended verbatim to the WHERE clause, so the caller
    must pass an already quoted SQL string literal. Prints a short status
    line for either outcome.
    """
    lookup = 'SELECT * FROM {} WHERE `hitokoto` = '.format(table_name) + check_hitokoto
    cur.execute(lookup)
    # A fetched row means the text is already stored.
    if cur.fetchone():
        print ('Nothing Speciall!')
        return False
    print ('This is New!')
    return True
##############
# Main Start #
##############
# Remove old accdb (skipped when the file does not exist, so a missing
# file no longer aborts the run)
if remove_old_db and os.path.exists(db_location):
    os.remove(db_location)
# Create new database
if create_new_db:
    connection = pypyodbc.win_create_mdb(db_location)
# Connect to accdb
# win_connect_mdb expects the database file path and builds the ODBC
# connection string itself; it was previously handed a full connection string.
conn = pypyodbc.win_connect_mdb(db_location)
# Create a cursor
cur = conn.cursor()
try:
    # Create a TABLE
    if create_new_table:
        Create_Table()
    if is_first_time:
        Request_Job(1) # For the first time. Must comment this line in the second time! Important!
    # Requests Start
    print ('Begin!')
    # range end is exclusive: + 1 makes the loop run exactly loop_times times
    # and lets the progress bar reach 100 %.
    for x in range(1, loop_times + 1):
        try:
            SQL_max_id = 'SELECT MAX(id) FROM ' + table_name
            cur.execute(SQL_max_id)
            max_id = cur.fetchone()[0]
            # MAX(id) is NULL (None) on an empty table; start numbering at 1
            # instead of failing on None + 1 in every iteration.
            Request_Job((max_id or 0) + 1)
            print ("Info: Job Finished!")
        except Exception as exc: # Handling exceptions
            # Catch Exception rather than everything so Ctrl-C still stops
            # the run, and show the cause instead of hiding it.
            print ("Error: Operating Failed!", exc)
        else:
            print ("Succeed")
        # Progress bar, 50 characters wide
        times = round(x*(50/loop_times))
        print (u"\u2588" * times + u"\u2592" * (50-times))
    print ('All Done!')
finally:
    # Release the database handles even when the run is interrupted.
    cur.close()
    conn.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment