Skip to content

Instantly share code, notes, and snippets.

@git-bhanu
Created June 4, 2021 09:31
Show Gist options
  • Save git-bhanu/bdf1ffff7c0a99d03327b7e4434d01fc to your computer and use it in GitHub Desktop.
Save git-bhanu/bdf1ffff7c0a99d03327b7e4434d01fc to your computer and use it in GitHub Desktop.
Python script to scrape data
import pandas as pd
from tqdm import tqdm
from selenium.webdriver import Chrome
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import TimeoutException
from time import sleep
# CSV that is both the input (rows to scrape) and the output (updated in
# place), and the location of the local chromedriver binary.
CSV_PATH = r"D:\Scrape\WooCommerce Plugin\woocommerce_resultData.csv"
CHROMEDRIVER_PATH = r"C:\chromedriver_win32\chromedriver.exe"

# Removes the first "product-new-header" element from the page before the
# ".entry-content" text is read. The `if(aa)` guard makes this a no-op when
# the page has no such header instead of throwing inside the browser.
REMOVE_HEADER_JS = (
    "var aa=document.getElementsByClassName('product-new-header')[0];"
    "if(aa){aa.parentNode.removeChild(aa);}"
)

# Columns written back for every scraped row.
OUTPUT_COLUMNS = ('link', 'name', 'content', 'price', 'categories')


def strip_query(url):
    """Return *url* without its query string.

    Everything from the first '?' onwards is dropped. A URL without a '?'
    is returned unchanged (slicing with ``url.find('?')`` would cut off its
    last character, because ``find`` returns -1 when nothing is found).
    """
    return url.partition('?')[0]


def scrape_product(driver):
    """Extract the product fields from the page currently loaded in *driver*.

    Returns a dict with the keys 'name', 'content', 'price' and
    'categories', or None when the page has no ".entry-content" element.
    A field that cannot be located is stored as False.
    """
    try:
        main_content = driver.find_element_by_css_selector(".entry-content")
    except Exception:
        return None

    # Title of the product
    try:
        heading = main_content.find_element_by_css_selector(
            ".product-new-header h1")
        name = heading.get_attribute('innerHTML')
    except Exception:
        name = False

    # Drop the header so it is not part of the content text read below.
    driver.execute_script(REMOVE_HEADER_JS)

    # Content of the product
    try:
        content = driver.find_element_by_css_selector(".entry-content").text
    except Exception:
        content = False

    # Price of the product: tier price first, buy-now button as fallback.
    try:
        price = driver.find_element_by_css_selector(
            ".product-new-sidebar .tier-price").text
    except NoSuchElementException:
        try:
            price = driver.find_element_by_css_selector(
                "button[name=buy-now-button]").text
        except NoSuchElementException:
            price = False

    # Categories of the product, joined into one comma-separated string.
    try:
        category_box = driver.find_element_by_css_selector(".details-category")
        anchors = category_box.find_elements_by_tag_name("a")
        categories = ",".join(anchor.text for anchor in anchors)
    except Exception:
        categories = False

    return {
        'name': name,
        'content': content,
        'price': price,
        'categories': categories,
    }


def main():
    """Scrape every CSV row whose 'content' is still empty and save results.

    Rows that already have content are skipped, so an interrupted run can be
    resumed. Pages that fail to load (including page-load timeouts) or that
    lack an ".entry-content" element are skipped and left untouched.
    """
    data = pd.read_csv(CSV_PATH)

    # 'price' and 'categories' may not exist yet on a first run.
    for column in ('price', 'categories'):
        if column not in data.columns:
            data[column] = pd.Series(index=data.index, dtype=object)
    # Scraped values are strings or False; an all-empty column is read as
    # float64, and writing strings into it is deprecated in recent pandas.
    for column in OUTPUT_COLUMNS:
        data[column] = data[column].astype(object)

    driver = Chrome(executable_path=CHROMEDRIVER_PATH)
    try:
        driver.set_page_load_timeout(10)
        bar = tqdm(data.index)
        for ind in bar:
            bar.set_postfix({'Package': data['name'][ind]})

            # Only rows without content still need scraping.
            if not pd.isna(data['content'][ind]):
                continue

            url = data['link'][ind]
            if not isinstance(url, str):
                # Missing link: nothing to fetch for this row.
                continue
            link = strip_query(url)

            try:
                driver.get(link)
            except Exception:
                # Includes TimeoutException; best-effort, move on.
                continue

            fields = scrape_product(driver)
            if fields is None:
                continue

            data.loc[ind, 'link'] = link
            data.loc[ind, 'name'] = fields['name']
            data.loc[ind, 'content'] = fields['content']
            data.loc[ind, 'price'] = fields['price']
            data.loc[ind, 'categories'] = fields['categories']

            # Persist after each scraped row so progress survives a crash.
            # NOTE(review): index=True is kept from the original; because
            # the file is re-read without index_col, every resume adds an
            # extra "Unnamed" index column - confirm whether that is wanted.
            data.to_csv(CSV_PATH, index=True)
    finally:
        # Always release the browser, even on error or Ctrl-C.
        driver.quit()


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment