# Jd Scrapper
# Source: GitHub gist arunkarnann/a96c63ae03d3bfab85fe811e3dd1f241
# (created February 8, 2018 10:08).
# NOTE: GitHub flagged this file as containing bidirectional Unicode text that
# may be interpreted or compiled differently than it appears. Review it in an
# editor that reveals hidden Unicode characters.
import requests | |
from lxml import html | |
from tkinter import * | |
#import tkinter as ttk | |
import re | |
import datetime | |
import os | |
from firebase import firebase | |
import hashlib | |
#import App as App | |
from selenium import webdriver | |
from selenium.webdriver.common.keys import Keys | |
import time | |
# XPath evaluated by Start() on a results page to pick out the listing <li> nodes.
path_init = '//li[@class="cntanr"]'
# XPath evaluated by seperate(); full_seperate() reads the 'href' of each match.
path_url = '//li/section/div[1]/section/div[1]/h4/span/a'
# The XPaths below are evaluated by full_seperate() on each detail page, which
# it requests with mobile_headers. The positional paths are tied to the exact
# page layout -- presumably brittle if the site markup changes; verify.
path_name = '//html/body/div[2]/div/div[1]/div/div[2]/div[1]/div[3]/div/span[1]/span[1]//text()'
path_area = '//html/body/div[2]/div/div[1]/div/div[2]/div[1]/div[3]/div/span[3]//text()'
path_full_phone_number = '//span[@class="vstwp"]/span//text()'
path_categories = '//html/body/div[2]/div/div[1]/div/div[3]/div[1]/div/div[8]/ul/li//text()'
path_address = '//span[@class="lctadrs"]//text()'
# Path of the CSV currently being written; set by filenameinit(), read by
# ClearFile(), full_seperate() and End().
file_name = ''
# User-Agent sent by Start() for results pages.
pc_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:20.0) Gecko/20100101 Firefox/20.0'}
# User-Agent sent by full_seperate() for detail pages.
mobile_headers = {'User-Agent':'Mozilla/5.0 (Linux; U; Android 4.0.3; ko-kr; LG-L160L Build/IML74K) AppleWebkit/534.30 (KHTML, like Gecko) Version/4.0 Mobile Safari/534.30'}
# NOTE(review): this module-level list is not read by any function visible in
# this file; functions that use the name `y` bind their own local.
y = []
# `global` at module scope has no effect; the name is already module-level.
global activated
# Licence-activation flag. No code visible here reads it.
activated = False
#driver = webdriver.Firefox()
def filenameinit(astring):
    """Derive the output CSV path for one input URL and store it in ``file_name``.

    The URL prefix, every ``/``, every space and every newline are removed
    from *astring*; the result is suffixed with ``timestring()`` and ``.csv``
    and placed inside an ``output`` directory under the current working
    directory. The directory is created if it does not exist, so that the
    later ``open(file_name, 'w')`` in ClearFile() cannot fail on a missing
    folder.

    Args:
        astring: One line of the input file (may end with a newline).

    Side effects:
        Rebinds the module-level ``file_name`` and prints the new path.
    """
    global file_name
    stem = astring.replace('https://www.jd.com/', '')
    stem = stem.replace('/', '').replace(' ', '')
    # Lines read from input.txt still carry their trailing newline.
    stem = stem.replace('\n', '')
    # os.path.join avoids the invalid '\o' escape of the old '\output\\'
    # literal and yields the same path on Windows.
    output_dir = os.path.join(os.getcwd(), 'output')
    os.makedirs(output_dir, exist_ok=True)
    file_name = os.path.join(output_dir, stem + timestring() + '.csv')
    print(file_name)
def timestring(now=None):
    """Return a ``year-month-day-hour-minute-second`` stamp (fields not zero-padded).

    Args:
        now: Optional ``datetime.datetime`` to format. Defaults to the current
            local time, which is what existing callers get.

    Returns:
        The stamp as a string, e.g. ``'2018-2-8-10-8-5'``. The stamp is also
        printed.
    """
    if now is None:
        now = datetime.datetime.now()
    fields = (now.year, now.month, now.day, now.hour, now.minute, now.second)
    wholestring = '-'.join(str(field) for field in fields)
    print(wholestring)
    return wholestring
#Delete file contents
def ClearFile():
    """Truncate the file at ``file_name`` and write the CSV header row.

    Reads the module-level ``file_name``. The ``with`` block closes the file,
    so no explicit ``close()`` is needed.
    """
    with open(file_name, 'w') as f:
        f.write('"Name","Area","Listed In ","Address","Pincode","Phone Numbers"\n')
def text_merge(input_list):
    """Join *input_list* into one double-quoted, comma-separated CSV field.

    Items are joined by position. The previous implementation compared each
    item with the last item's value, so a repeated value lost its comma.
    Embedded double quotes are doubled so the field stays a single valid CSV
    cell.

    Args:
        input_list: Sequence of strings (may be empty).

    Returns:
        The quoted field, e.g. ``'"a,b"'``; ``'""'`` for an empty sequence.
        The field is also printed.
    """
    joined = ','.join(input_list)
    all_items = '"' + joined.replace('"', '""') + '"'
    print(all_items)
    return all_items
def text_merge_numbers(input_list):
    """Join *input_list* with commas, without surrounding quotes.

    Items are joined by position. The previous implementation compared each
    item with the last item's value, so a number that appeared twice was
    glued to its neighbour.

    Args:
        input_list: Sequence of strings (may be empty).

    Returns:
        The comma-separated string; ``''`` for an empty sequence. The string
        is also printed.
    """
    all_items = ','.join(input_list)
    print(all_items)
    return all_items
def seperate(mydoc):
    """Collect listing anchors from *mydoc* and pass them to full_seperate().

    Args:
        mydoc: Iterable of lxml nodes; each node is queried with ``path_url``.
    """
    # Fallback so full_seperate() still gets a list when mydoc is empty.
    y = []
    for x in mydoc:
        # Each iteration overwrites y, so only the last node's result is kept.
        y = x.xpath(path_url)
        #print(y)
    # NOTE(review): the source lost its indentation. This call is assumed to
    # sit after the loop (the `y = []` pre-initialisation suggests so) --
    # confirm against the original file.
    full_seperate(y)
def extract_pincode(string):
    """Return the first run of exactly six digits in *string*.

    Args:
        string: Text to search, e.g. an address.

    Returns:
        The first maximal digit run whose length is 6, or ``''`` when there
        is none.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    for digits in re.findall(r'\d+', string):
        if len(digits) == 6:
            return digits
    return ''
def internet_on():
    """Probe connectivity with a one-second GET to a fixed IP address.

    Returns:
        True when the request completes, False when it fails with a
        requests or OS-level error.
    """
    try:
        requests.get('http://216.58.192.142', timeout=1)
    except (requests.RequestException, OSError):
        # Named exceptions only: a bare `except:` would also swallow
        # KeyboardInterrupt and SystemExit.
        return False
    return True
def Check():
    """Block until internet_on() succeeds, then return True.

    While the probe fails, a message is printed and the function sleeps for
    10 seconds before retrying. The previous recursive version discarded the
    result of its retry and returned None after a recovery, so callers
    testing ``if Check():`` skipped their work. A loop also avoids unbounded
    recursion during a long outage.

    Returns:
        Always True (once connectivity is available).
    """
    while not internet_on():
        print("Internet Connectivity lost | Trying reconnecting")
        time.sleep(10)
    #print("Working")
    return True
def _refine_phone_numbers(numbers, include_landlines):
    """Return the entries of *numbers* that pass the length/dash filter."""
    refined = []
    for number in numbers:
        if include_landlines:
            # Checkbox ticked: keep anything longer than 8 characters.
            if len(number) > 8:
                refined.append(number)
        elif len(number) > 9:
            # Checkbox clear: entries containing '(91)' may hold one dash,
            # all others must hold none.
            allowed_dashes = 1 if '(91)' in number else 0
            if number.count('-') <= allowed_dashes:
                refined.append(number)
    return refined


def full_seperate(y):
    """Fetch each listing's detail page and append one CSV row per listing.

    For every anchor in *y* the ``href`` has ``'www'`` replaced by ``'t'``
    and is requested with ``mobile_headers``. Name, area, categories, address
    and phone numbers are extracted with the module-level XPaths, and a row is
    appended to ``file_name``.

    Args:
        y: Iterable of lxml elements that carry an ``href`` attribute.

    Notes:
        * A listing whose request fails is skipped after printing a message.
        * The last category entry is always dropped before writing
          (NOTE(review): presumably a trailing non-category item; verify).
        * The page is parsed once; it used to be re-parsed for every XPath.
        * NOTE(review): the source lost its indentation. The nesting of the
          phone filter and of the code following ``if Check()`` was
          reconstructed from context -- confirm against the original file.
    """
    for link in y:
        #print(link.attrib['href'])
        url_two = link.attrib['href'].replace('www', 't')
        if not Check():
            continue
        try:
            data_two = requests.get(url_two, headers=mobile_headers)
        except requests.RequestException:
            print("Internet Connectivity lost ")
            continue

        page = html.fromstring(data_two.content)
        names = page.xpath(path_name)
        areas = page.xpath(path_area)
        categories = page.xpath(path_categories)
        addresses = page.xpath(path_address)
        phone_numbers = page.xpath(path_full_phone_number)
        print(phone_numbers)

        pincode = extract_pincode(addresses[0]) if addresses else ''
        refined_numbers = _refine_phone_numbers(phone_numbers, boolvar.get())

        if categories:
            categories.pop()

        row = ','.join([
            str(text_merge(names)),
            str(text_merge(areas)),
            str(text_merge(categories)),
            str(text_merge(addresses)),
            str(pincode),
            str(text_merge_numbers(refined_numbers)),
        ])
        with open(file_name, 'a') as f:
            f.write(row + '\n')
def Start(url, pages=10):
    """Scrape paginated results by requesting ``url + '/page-N'``.

    Args:
        url: Base results URL; ``'/page-<n>'`` is appended for each page.
        pages: Number of pages to request, starting at 1. Defaults to 10,
            the previously hard-coded count.

    Each page is fetched with ``pc_headers``, the listing nodes are selected
    with ``path_init`` and handed to seperate(). The page number is printed
    before each request.
    """
    for page_number in range(1, pages + 1):
        url_page = url + '/page-' + str(page_number)
        print(str(page_number))
        data = requests.get(url_page, headers=pc_headers)
        mydoc = html.fromstring(data.content).xpath(path_init)
        seperate(mydoc)
def Start2(url, driver, scrolls=15, pause=10):
    """Load *url* in a browser, scroll to the bottom repeatedly, then scrape.

    Args:
        url: Results page to open.
        driver: Selenium WebDriver instance used to load and scroll the page.
        scrolls: Number of scroll-to-bottom steps. Defaults to 15, the
            previously hard-coded count.
        pause: Seconds to wait after each scroll. Defaults to 10.

    After scrolling, the root element's ``outerHTML`` is parsed with lxml and
    passed to seperate().

    Notes:
        NOTE(review): the source lost its indentation. The counter increment
        and sleep are assumed to belong to the ``if Check()`` branch --
        confirm against the original file.
    """
    driver.get(url)
    numb = 0
    while numb < scrolls:
        if Check():
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            print(numb)
            numb += 1
            time.sleep(pause)
    # find_element(by, value) exists in both Selenium 3 and 4, whereas
    # find_element_by_xpath was removed in Selenium 4. "xpath" is the value
    # of By.XPATH.
    elem = driver.find_element("xpath", "//*")
    source_code = elem.get_attribute("outerHTML")
    seperate(html.fromstring(source_code))
def date_key():
    """Return the MD5 hex digest of the ``getctime`` timestamp of ``string.txt``.

    The value of ``os.path.getctime('string.txt')`` (0 when the call raises
    OSError, e.g. the file is missing) is converted with
    ``datetime.fromtimestamp``, stringified, UTF-8 encoded and hashed. The
    digest is printed before it is returned.
    """
    try:
        #Getting Created Time is Unique and protects from copying file
        stamp = os.path.getctime('string.txt')
    except OSError:
        stamp = 0
    stamp_text = str(datetime.datetime.fromtimestamp(stamp))
    digest = hashlib.md5()
    digest.update(stamp_text.encode('utf-8'))
    print(digest.hexdigest())
    return digest.hexdigest()
def check_key_file(key_string='H'):
    """Accept *key_string* if it is unclaimed or already bound to this machine.

    The key's value under ``/KEYS`` is fetched once. When it equals ``'NONE'``
    or the current ``date_key()``, the key is written to ``string.txt``, the
    remote entry is patched with the fresh ``date_key()`` and the activation
    widgets are disabled. Otherwise the submit button is disabled.

    Args:
        key_string: Licence key to check.

    Side effects:
        Sets the module-level ``activated`` flag. The old code bound a local
        of that name, so the global never changed.
    """
    global activated
    stored_value = firebase.get('/KEYS', key_string)
    # date_key() is evaluated here BEFORE string.txt is rewritten, and again
    # afterwards for the patch, matching the original evaluation order.
    if stored_value == 'NONE' or stored_value == date_key():
        with open("string.txt", 'w') as f:
            f.write(key_string)
        firebase.patch('/KEYS', {key_string: date_key()})
        #print("FILE CREATED")
        activated = True
        # submit_button.config(state=NORMAL)
        activate_button.config(state=DISABLED)
        key_filled.config(state=DISABLED)
    else:
        #print("INVALID KEY")
        #result['text'] = "Invalid Key"
        activated = False
        submit_button.config(state=DISABLED)
def check_key(key_string='H'):
    """Activate *key_string* when its remote entry is still ``'NONE'``.

    Does nothing when Check() returns a falsy value. For an unclaimed key the
    key is written to ``string.txt``, the remote entry is patched with
    ``date_key()``, the submit button is enabled and the activation widgets
    are disabled. For any other remote value the submit button is disabled.

    Args:
        key_string: Licence key entered by the user.

    Side effects:
        Sets the module-level ``activated`` flag. The old code bound a local
        of that name, so the global never changed.

    Notes:
        NOTE(review): the source lost its indentation. The ``else`` is assumed
        to pair with the key comparison rather than with ``if Check()`` --
        confirm against the original file.
    """
    global activated
    if not Check():
        return
    if firebase.get('/KEYS', key_string) == 'NONE':
        print("NEW KEY")
        with open("string.txt", 'w') as f:
            f.write(key_string)
        firebase.patch('/KEYS', {key_string: date_key()})
        #print("FILE CREATED")
        activated = True
        submit_button.config(state=NORMAL)
        activate_button.config(state=DISABLED)
        key_filled.config(state=DISABLED)
    else:
        #print("INVALID KEY")
        activated = False
        #result['text'] = "Invalid Key"
        # One call is enough; the old code disabled the button twice.
        submit_button.config(state=DISABLED)
def key_validation(key_string="H"):
    """Validate the stored licence key, or fall back to activating a new one.

    If ``string.txt`` can be read, its content is looked up under ``/KEYS``
    and compared with ``date_key()``:

    * match: the submit button is enabled, the activation widgets are
      disabled and the stored key is shown in the entry;
    * mismatch: the submit button is disabled and the result label shows
      ``'Contact Admin'``.

    If the file cannot be read or the lookup raises, control falls back to
    ``check_key(key_filled.get())``, as the original did.

    Args:
        key_string: Accepted for interface compatibility; the fallback reads
            the key from the entry widget, exactly as before.

    Side effects:
        Sets the module-level ``activated`` flag on a match. The old code
        bound a local of that name, so the global never changed.
    """
    global activated
    #print("ENTERING")
    try:
        with open("string.txt", 'r') as key_file:
            the_key = key_file.read()
    except OSError:
        # No stored key (e.g. first run): treat the typed key as a new one.
        check_key(key_filled.get())
        return

    print("FILE EXIST")
    print(the_key)
    try:
        expected = date_key()
        print(expected)
        stored_value = firebase.get('/KEYS', the_key)
        print(stored_value)
    except Exception:
        # Deliberate best-effort: the original bare except routed every
        # failure to check_key(). Exception (not BaseException) keeps that
        # behavior while letting KeyboardInterrupt and SystemExit through.
        check_key(key_filled.get())
        return

    if stored_value == expected:
        print("KEY MATCHED")
        activated = True
        submit_button.config(state=NORMAL)
        activate_button.config(state=DISABLED)
        # Show the key that was read above. The old code called file.read()
        # a second time, which always returned '' at end-of-file.
        key_filled.delete(0, END)
        key_filled.insert(END, the_key)
        key_filled.config(state=DISABLED)
    else:
        submit_button.config(state=DISABLED)
        result['text'] = 'Contact Admin'
def End():
    """Print ``DONE`` and show the finished message with the output path."""
    print("DONE")
    finished_message = 'Finished Scraping ' + 'Check :' + file_name
    result.config(text=finished_message)
def button_clicked(event):
    """Handle a click on the scrape button: scrape every URL in ``input.txt``.

    Returns immediately when the submit button is disabled. Otherwise a
    Firefox WebDriver is started and, for each non-blank line of
    ``input.txt``, a fresh output file is prepared (filenameinit + ClearFile)
    and the page is scraped with Start2(). End() is called once afterwards.

    Args:
        event: Tk event object (unused).

    Notes:
        * The driver is now quit in a ``finally`` block. The old code left
          the browser process running.
        * Blank lines in ``input.txt`` are skipped instead of being opened as
          URLs.
        * NOTE(review): the source lost its indentation. End() is assumed to
          run once after the loop -- confirm against the original file.
    """
    if submit_button['state'] == 'disabled':
        #print("OOPS YOU CANT")
        return

    url = 'input.txt'
    result['text'] = "Processing..."
    # The scrape below blocks the Tk event loop, so repaint the label first.
    result.update_idletasks()
    driver = webdriver.Firefox()
    try:
        with open(url, 'r') as f:
            for line in f:
                if not line.strip():
                    continue
                filenameinit(line)
                ClearFile()
                Start2(line.strip(), driver)
    finally:
        driver.quit()
    End()
def activate_key(event):
    """Click handler: pass the text of the key entry to key_validation()."""
    entered_key = key_filled.get()
    key_validation(entered_key)
# Print the working directory; filenameinit() builds output paths under it.
print(os.getcwd())
#Region Tk
root = Tk()
root['height'] = 400
root['width'] = 600
# `global` at module scope has no effect.
global firebase
# Rebinds the imported `firebase` module name to the application client that
# the key-check functions use via firebase.get()/firebase.patch().
firebase = firebase.FirebaseApplication('##URL##',None)
# Single frame holding every widget, sized like the root window.
f1 = Frame(root)
f1["height"] = root["height"]
f1["width"] = root["width"]
root.title("JD Scrapper - Gear Up Studio ")
Label(f1,text = "Input Url : Example : ##URL##").grid(row=0,column = 0,)
def getBool(event):
    """Click handler for the checkbox: print the current value of ``boolvar``."""
    current_value = boolvar.get()
    print(current_value)
#Check Button
# The `global` statements in this section are at module scope and have no effect.
global boolvar
# Read by full_seperate() to choose which phone-number filter to apply.
boolvar = BooleanVar()
boolvar.set(False)
boolvar.trace('w', lambda *_: print("The value was changed"))
cb = Checkbutton(f1, text = "Tele Phone number", variable = boolvar)
cb.bind("<Button-1>", getBool)
cb.grid(row=1, column=1)
global key_filled
# Entry where the user types the licence key.
key_filled = Entry(f1,width=50)
key_filled.grid(row=2,column=0)
key_filled.focus_set()
global activate_button
activate_button = Button(f1 , text="Active Now")
# NOTE(review): handlers are attached with bind("<Button-1>") rather than
# command=; presumably such bindings still fire on a disabled button, which
# would explain the explicit state check in button_clicked() -- confirm.
activate_button.bind("<Button-1>",activate_key)
activate_button.grid(row=2, column=1)
# Status label written by End(), button_clicked() and key_validation().
result = Label(f1, width=50)
result.grid(row=1,column=2)
global submit_button
submit_button = Button(f1 , text="Scrap Now")
submit_button.bind("<Button-1>",button_clicked)
submit_button.grid(row=1, column=0)
submit_button.config(state=NORMAL)
# Runs once at startup, after every widget it touches exists.
key_validation()
f1.pack()
root.mainloop()
# (GitHub page footer: sign up for free to join this conversation on GitHub,
# or sign in to comment if you already have an account.)