Skip to content

Instantly share code, notes, and snippets.

@arunkarnann
Created February 8, 2018 10:08
Show Gist options
  • Save arunkarnann/a96c63ae03d3bfab85fe811e3dd1f241 to your computer and use it in GitHub Desktop.
Save arunkarnann/a96c63ae03d3bfab85fe811e3dd1f241 to your computer and use it in GitHub Desktop.
Jd Scrapper
import requests
from lxml import html
from tkinter import *
#import tkinter as ttk
import re
import datetime
import os
from firebase import firebase
import hashlib
#import App as App
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
# XPath matching each listing container (<li class="cntanr">) on a results page; used by Start().
path_init = '//li[@class="cntanr"]'
# XPath of the anchors whose href is followed to a listing's detail page (see seperate/full_seperate).
path_url = '//li/section/div[1]/section/div[1]/h4/span/a'
# XPaths evaluated against a detail page in full_seperate(). Their results are written,
# in order, under the CSV columns Name, Area, Phone Numbers, Listed In and Address.
# NOTE(review): path_name, path_area and path_categories are absolute, position-based
# paths — presumably tied to one specific page layout; confirm against the live markup.
path_name = '//html/body/div[2]/div/div[1]/div/div[2]/div[1]/div[3]/div/span[1]/span[1]//text()'
path_area = '//html/body/div[2]/div/div[1]/div/div[2]/div[1]/div[3]/div/span[3]//text()'
path_full_phone_number = '//span[@class="vstwp"]/span//text()'
path_categories = '//html/body/div[2]/div/div[1]/div/div[3]/div[1]/div/div[8]/ul/li//text()'
path_address = '//span[@class="lctadrs"]//text()'
# Path of the CSV currently being written; rebound by filenameinit() for every input URL.
file_name = ''
# User-Agent headers: pc_headers is sent by Start(), mobile_headers by full_seperate().
pc_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}
mobile_headers = {'User-Agent':'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}
# Module-level list; the functions visible in this file only use their own local ``y``.
y = []
# A ``global`` statement at module scope has no effect; ``activated`` is simply a module variable.
global activated
# Licence-activation flag, initially off.
activated = False
#driver = webdriver.Firefox()
def filenameinit(astring):
    """Derive the output CSV path for one input URL and store it in ``file_name``.

    The site prefix, slashes, spaces and newlines (input lines arrive with
    their trailing newline) are removed from ``astring``; the timestamp from
    ``timestring()`` and ``.csv`` are appended, and the result is placed in an
    ``output`` directory under the current working directory.

    Args:
        astring: URL (or URL-like line) the output file is named after.

    Side effects:
        Rebinds the module-level ``file_name``, creates the ``output``
        directory when it is missing, and prints the resulting path.
    """
    global file_name
    slug = astring.replace('https://www.jd.com/', '')
    slug = slug.replace('/', '').replace(' ', '').replace('\n', '')
    # os.path.join replaces the hand-written '\output\\' literal, which relied on an
    # invalid '\o' escape and only produced a usable path on Windows.
    output_dir = os.path.join(os.getcwd(), 'output')
    # ClearFile() opens this path for writing, which fails if the directory is absent.
    os.makedirs(output_dir, exist_ok=True)
    file_name = os.path.join(output_dir, slug + timestring() + '.csv')
    print(file_name)
def timestring():
    """Return the current local time as ``year-month-day-hour-minute-second``.

    The fields are plain integers joined by ``-`` (no zero padding). The
    string is also printed before being returned.
    """
    moment = datetime.datetime.now()
    fields = (
        moment.year,
        moment.month,
        moment.day,
        moment.hour,
        moment.minute,
        moment.second,
    )
    wholestring = '-'.join(str(field) for field in fields)
    print(wholestring)
    return wholestring
def ClearFile():
    """Truncate the file at ``file_name`` and write the CSV header row.

    Any existing content of the file is discarded.
    """
    header_row = '"Name","Area","Listed In ","Address","Pincode","Phone Numbers"\n'
    # Mode 'w' truncates; the context manager closes the handle on exit.
    with open(file_name, 'w') as csv_file:
        csv_file.write(header_row)
def text_merge(input_list):
    """Join text fragments into one double-quoted CSV field.

    Args:
        input_list: Iterable of strings (for example the text nodes returned
            by an XPath query).

    Returns:
        The fragments joined with ``,`` and wrapped in double quotes. An
        empty input yields ``'""'``. Double quotes inside a fragment are
        doubled so the field stays a single, well-formed CSV cell.

    The merged value is also printed.
    """
    # str.join puts a separator between every pair of items. Comparing each item with
    # input_list[-1] instead dropped the comma after any earlier item equal to the last.
    escaped = (item.replace('"', '""') for item in input_list)
    all_items = '"' + ','.join(escaped) + '"'
    print(all_items)
    return all_items
def text_merge_numbers(input_list):
    """Join phone-number strings with commas, without surrounding quotes.

    Args:
        input_list: Iterable of strings.

    Returns:
        The items joined with ``,``; an empty input yields ``''``.

    The merged value is also printed.
    """
    # str.join puts a separator between every pair of items. Comparing each item with
    # input_list[-1] instead dropped the comma after any earlier item equal to the last.
    all_items = ','.join(input_list)
    print(all_items)
    return all_items
def seperate(mydoc):
    """Collect the listing anchors reachable from ``mydoc`` and hand them to ``full_seperate``.

    Args:
        mydoc: Iterable of lxml nodes (Start passes a list of elements,
            Start2 passes a parsed document element).

    NOTE(review): indentation was lost in this copy of the file. The call to
    ``full_seperate`` is assumed to run once, after the loop — confirm against
    the original.
    """
    y = []
    for x in mydoc:
        # Each iteration overwrites ``y``; only the last node's query result is kept.
        y = x.xpath(path_url)
        #print(y)
    full_seperate(y)
def extract_pincode(string):
    """Return the first run of exactly six consecutive digits in ``string``.

    Args:
        string: Text to search (an address line).

    Returns:
        The first six-digit run, or ``''`` when there is none. Longer or
        shorter digit runs are ignored.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    digit_runs = re.findall(r'\d+', string)
    return next((run for run in digit_runs if len(run) == 6), '')
def internet_on(url='http://216.58.192.142', timeout=1):
    """Probe connectivity with a single HTTP GET.

    Args:
        url: Address to request; defaults to the fixed IP this script has
            always probed.
        timeout: Seconds to wait before the probe counts as failed.

    Returns:
        True when the request completes, False when ``requests`` reports a
        failure (connection error, timeout, ...).
    """
    try:
        requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException:
        # Only request failures mean "offline"; a bare except would also
        # swallow KeyboardInterrupt and genuine programming errors.
        return False
    return True
def Check():
    """Block until the connectivity probe succeeds, then return True.

    While ``internet_on()`` reports failure, a message is printed and the
    probe is retried every 10 seconds.

    Returns:
        Always True, so ``if Check():`` guards proceed once the connection
        is back.
    """
    # A loop instead of a recursive call: the recursive version discarded the inner
    # result (returning None after a reconnect) and grew the stack during long outages.
    while not internet_on():
        print("Internet Connectivity lost | Trying reconnecting")
        time.sleep(10)
    return True
def _keep_phone_number(number, telephone_checked):
    """Decide whether one scraped phone string is written to the CSV.

    NOTE(review): the if/else nesting was reconstructed from an unindented
    copy of the file — confirm against the original.
    """
    if telephone_checked:
        return len(number) > 8
    if len(number) <= 9:
        return False
    if '(91)' in number:
        return number.count('-') <= 1
    return number.count('-') == 0


def full_seperate(y):
    """Fetch every listing's detail page and append one CSV row per listing.

    Args:
        y: Iterable of lxml anchor elements; each ``href`` is the listing URL.

    For each anchor, the URL (with ``www`` replaced by ``t``) is requested
    using ``mobile_headers``. Name, area, categories, address and phone
    numbers are extracted with the module-level XPaths. The pincode is taken
    from the first address fragment. The row is appended to ``file_name``.
    Listings whose request fails, or whose anchor has no ``href``, are skipped.
    """
    for link in y:
        href = link.attrib.get('href')
        if not href:
            continue
        detail_url = href.replace('www', 't')
        # Check() only returns once the connectivity probe has succeeded, so the
        # request is always attempted afterwards rather than silently re-using the
        # response of the previous listing.
        Check()
        try:
            response = requests.get(detail_url, headers=mobile_headers)
        except requests.exceptions.RequestException:
            print("Internet Connectivity lost ")
            continue

        # Parse the page once and run every XPath against the same tree.
        page = html.fromstring(response.content)
        names = page.xpath(path_name)
        areas = page.xpath(path_area)
        # The last category node is always discarded.
        categories = page.xpath(path_categories)[:-1]
        address = page.xpath(path_address)
        phone_numbers = page.xpath(path_full_phone_number)
        print(phone_numbers)

        pincode = extract_pincode(address[0]) if address else ''

        telephone_checked = boolvar.get()
        kept_numbers = [
            number for number in phone_numbers
            if _keep_phone_number(number, telephone_checked)
        ]

        row = ','.join([
            str(text_merge(names)),
            str(text_merge(areas)),
            str(text_merge(categories)),
            str(text_merge(address)),
            str(pincode),
            str(text_merge_numbers(kept_numbers)),
        ]) + '\n'
        with open(file_name, 'a') as f:
            f.write(row)
def Start(url, pages=10):
    """Scrape paginated result pages ``<url>/page-1`` .. ``<url>/page-<pages>``.

    Args:
        url: Base results URL; surrounding whitespace (such as the newline of
            a line read from a file) is ignored.
        pages: Number of pages to fetch; defaults to 10.

    Each page is requested with ``pc_headers``, and the listing containers
    matched by ``path_init`` are passed to ``seperate``. Request errors
    propagate to the caller.
    """
    base_url = url.strip()
    for page_number in range(1, pages + 1):
        print(str(page_number))
        response = requests.get(base_url + '/page-' + str(page_number), headers=pc_headers)
        seperate(html.fromstring(response.content).xpath(path_init))
def Start2(url, driver, scrolls=15, delay=10):
    """Load ``url`` in a browser, scroll to the bottom repeatedly, then scrape the page.

    Args:
        url: Results page URL; surrounding whitespace (such as the newline of
            a line read from a file) is ignored.
        driver: Selenium WebDriver instance used to load and scroll the page.
        scrolls: Number of scroll-to-bottom steps; defaults to 15.
        delay: Seconds to wait after each scroll; defaults to 10.

    After scrolling, the outer HTML of the first element matched by ``//*``
    is parsed with lxml and passed to ``seperate``.
    """
    driver.get(url.strip())
    done = 0
    while done < scrolls:
        # A scroll is only counted when the connectivity check passes.
        if Check():
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(done)
            done += 1
            time.sleep(delay)
    # find_element(by, value) is used instead of find_element_by_xpath, which was
    # removed in Selenium 4. "xpath" is the value of By.XPATH.
    root_element = driver.find_element("xpath", "//*")
    source_code = root_element.get_attribute("outerHTML")
    seperate(html.fromstring(source_code))
def date_key():
    """Return the MD5 hex digest of the ``string.txt`` creation-time stamp.

    ``os.path.getctime('string.txt')`` is converted to a local
    ``datetime``, rendered with ``str()``, UTF-8 encoded and hashed. When the
    file cannot be stat'ed, timestamp 0 is used instead. The digest is also
    printed.
    """
    try:
        # The file's ctime is what ties a key to this particular copy of the file.
        created_at = os.path.getctime('string.txt')
    except OSError:
        created_at = 0
    stamp_text = str(datetime.datetime.fromtimestamp(created_at))
    hexkey = hashlib.md5(stamp_text.encode('utf-8')).hexdigest()
    print(hexkey)
    return hexkey
def check_key_file(key_string='H'):
    """Activate ``key_string`` when it is unused or already bound to this machine's key file.

    Args:
        key_string: Licence key to look up under ``/KEYS`` in Firebase.

    When the stored value is ``'NONE'`` or equals ``date_key()``, the key is
    written to ``string.txt``, the remote entry is patched with the new
    ``date_key()``, ``activated`` is set and the activation widgets are
    disabled. Otherwise ``activated`` is cleared and the submit button is
    disabled.
    """
    # Without this declaration the assignments below only created a local variable.
    global activated
    # One remote lookup serves both comparisons.
    stored_hash = firebase.get('/KEYS', key_string)
    if stored_hash == 'NONE' or stored_hash == date_key():
        with open("string.txt", 'w') as key_file:
            key_file.write(key_string)
        # date_key() is re-evaluated after the write so the stored hash matches the file on disk.
        firebase.patch('/KEYS', {key_string: date_key()})
        activated = True
        activate_button.config(state=DISABLED)
        key_filled.config(state=DISABLED)
    else:
        activated = False
        submit_button.config(state=DISABLED)
def check_key(key_string='H'):
    """Activate a never-used licence key; reject anything else.

    Args:
        key_string: Licence key to look up under ``/KEYS`` in Firebase.

    Nothing happens unless ``Check()`` reports connectivity. When the stored
    value is ``'NONE'``, the key is written to ``string.txt``, the remote
    entry is patched with ``date_key()``, ``activated`` is set, the submit
    button is enabled and the activation widgets are disabled. For any other
    stored value ``activated`` is cleared and the submit button is disabled.

    NOTE(review): the ``else`` is assumed to belong to the key comparison, not
    to the connectivity check (indentation was lost) — confirm.
    """
    # Without this declaration the assignments below only created a local variable.
    global activated
    if not Check():
        return
    if firebase.get('/KEYS', key_string) == 'NONE':
        print("NEW KEY")
        with open("string.txt", 'w') as key_file:
            key_file.write(key_string)
        firebase.patch('/KEYS', {key_string: date_key()})
        activated = True
        submit_button.config(state=NORMAL)
        activate_button.config(state=DISABLED)
        key_filled.config(state=DISABLED)
    else:
        activated = False
        submit_button.config(state=DISABLED)
def key_validation(key_string="H"):
    """Validate the key stored in ``string.txt``, or fall back to the typed key.

    Args:
        key_string: Unused. The key is read from ``string.txt``, or from the
            ``key_filled`` entry when falling back. Kept for callers that
            pass it.

    When ``string.txt`` is readable and its key's remote hash equals
    ``date_key()``, the application is activated: the submit button is
    enabled, the activation widgets are disabled and the entry shows the
    stored key. A hash mismatch disables the submit button and shows
    'Contact Admin'. When the file cannot be read, or the lookup fails, the
    key typed in ``key_filled`` is passed to ``check_key`` instead.
    """
    # Without this declaration the assignment below only created a local variable.
    global activated
    try:
        with open("string.txt", 'r') as key_file:
            the_key = key_file.read()
    except OSError:
        # No stored key yet: validate whatever is in the entry.
        check_key(key_filled.get())
        return
    print("FILE EXIST")
    print(the_key)
    try:
        expected_hash = date_key()
        stored_hash = firebase.get('/KEYS', the_key)
    except Exception:
        # Best-effort fallback kept from the original: any lookup failure
        # routes to the typed-key path instead of aborting the UI.
        check_key(key_filled.get())
        return
    print(expected_hash)
    print(stored_hash)
    if stored_hash != expected_hash:
        submit_button.config(state=DISABLED)
        result['text'] = 'Contact Admin'
        return
    print("KEY MATCHED")
    activated = True
    submit_button.config(state=NORMAL)
    activate_button.config(state=DISABLED)
    # Show the key that was read. A second read() on the same handle returns '',
    # which left the entry empty.
    key_filled.insert(END, the_key)
    key_filled.config(state=DISABLED)
def End():
    """Report completion on the console and in the ``result`` label."""
    print("DONE")
    finished_message = 'Finished Scraping ' + 'Check :' + file_name
    result.config(text=finished_message)
def button_clicked(event):
    """Handle a click on the scrape button: process every URL listed in ``input.txt``.

    Args:
        event: Tk event object (unused).

    Nothing happens while the submit button is disabled. Otherwise a Firefox
    driver is started and, for every non-blank line of ``input.txt``, a fresh
    output file is prepared and the URL is scraped with ``Start2``. The
    driver is always shut down afterwards; ``End()`` runs when all lines were
    processed without an exception.
    """
    # str() because the widget option may come back as a Tcl object rather than a plain str.
    if str(submit_button['state']) == 'disabled':
        return
    url = 'input.txt'
    result['text'] = "Processing..."
    driver = webdriver.Firefox()
    try:
        with open(url, 'r') as f:
            for line in f:
                target = line.strip()
                if not target:
                    # Blank lines would otherwise create an empty CSV and load an empty URL.
                    continue
                filenameinit(target)
                ClearFile()
                Start2(target, driver)
    finally:
        # Close the browser even when scraping fails, so no Firefox process is left running.
        driver.quit()
    End()
def activate_key(event):
    """Tk callback for the activation button: validate the key typed in ``key_filled``."""
    entered_key = key_filled.get()
    key_validation(entered_key)
# Print the working directory at startup.
print(os.getcwd())
#Region Tk
# Main window; the height/width options are set to 400 x 600.
root = Tk()
root['height'] = 400
root['width'] = 600
# A ``global`` statement at module scope has no effect.
global firebase
# Rebinds the name ``firebase`` from the imported module to the application client;
# every later ``firebase.get`` / ``firebase.patch`` call goes through this object.
firebase = firebase.FirebaseApplication('##URL##',None)
# Single frame holding all widgets, given the same height/width options as the window.
f1 = Frame(root)
f1["height"] = root["height"]
f1["width"] = root["width"]
root.title("JD Scrapper - Gear Up Studio ")
Label(f1,text = "Input Url : Example : ##URL##").grid(row=0,column = 0,)
def getBool(event):
    """Tk callback bound to the checkbox: print the current value of ``boolvar``."""
    current_value = boolvar.get()
    print(current_value)
#Check Button
# The ``global`` statements in this section are at module scope and have no effect.
global boolvar
# Checkbox state; full_seperate() reads it via boolvar.get() when filtering phone numbers.
boolvar = BooleanVar()
boolvar.set(False)
# Log every write to the variable.
boolvar.trace('w', lambda *_: print("The value was changed"))
cb = Checkbutton(f1, text = "Tele Phone number", variable = boolvar)
cb.bind("<Button-1>", getBool)
cb.grid(row=1, column=1)
global key_filled
# Entry for the licence key; read by activate_key() and key_validation().
key_filled = Entry(f1,width=50)
key_filled.grid(row=2,column=0)
key_filled.focus_set()
global activate_button
# Activation button: a left click runs activate_key().
activate_button = Button(f1 , text="Active Now")
activate_button.bind("<Button-1>",activate_key)
activate_button.grid(row=2, column=1)
# Status label written by End() and key_validation().
result = Label(f1, width=50)
result.grid(row=1,column=2)
global submit_button
# Scrape button: a left click runs button_clicked(), which returns early while the
# button state is 'disabled'.
submit_button = Button(f1 , text="Scrap Now")
submit_button.bind("<Button-1>",button_clicked)
submit_button.grid(row=1, column=0)
submit_button.config(state=NORMAL)
# Validate any stored key at startup; this must run after all widgets above exist
# because key_validation() configures them.
key_validation()
f1.pack()
root.mainloop()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment