Skip to content

Instantly share code, notes, and snippets.

View lakshay-arora's full-sized avatar
🇮🇳

Lakshay lakshay-arora

🇮🇳
  • Walmart
  • Bengaluru
View GitHub Profile
%%time
data['Number_of_divisor'] = data.Number.apply(countDivisors)
%%time
pool = mp.Pool(processes = (mp.cpu_count() - 1))
answer = pool.map(countDivisors,random_data)
pool.close()
pool.join()
"""
Web Scraping - Beautiful Soup
"""
# importing required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
# target URL to scrape
# find all the sections with the specified class name
# NOTE(review): `data` is presumably the parsed BeautifulSoup document; it is
# created outside this snippet.
# attrs is a dict mapping attribute name -> required value (note the colon:
# a comma here would build a set instead of the intended class filter).
cards_data = data.find_all(
    'div',
    attrs={
        'class': (
            'width100 fl htlListSeo hotel-tile-srp-container '
            'hotel-tile-srp-container-template new-htl-design-tile-main-block'
        ),
    },
)
# total number of cards
print('Total Number of Cards Found : ', len(cards_data))
# source code of hotel cards
for card in cards_data:
    print(card)
# extract the hotel name and price per room
for card in cards_data:
    # get the hotel name (first <p> element inside the card)
    hotel_name = card.find('p')
    # get the room price
    room_price = card.find('li', attrs={'class': 'htl-tile-discount-prc'})
    # find() returns None when the card has no matching element; print a
    # placeholder for that field instead of failing on `.text`.
    name_text = hotel_name.text if hotel_name is not None else 'N/A'
    price_text = room_price.text if room_price is not None else 'N/A'
    print(name_text, price_text)
# create a list to store the data
scraped_data = []
for card in cards_data:
    # initialize the dictionary
    card_details = {}
    # get the hotel name
    hotel_name = card.find('p')
    # NOTE(review): the snippet is cut off here in this view — nothing is
    # stored in card_details or appended to scraped_data yet.
"""
Web Scraping - Scrape Images
"""
# importing required libraries
import requests
from bs4 import BeautifulSoup
# target URL: Goibibo listing page for hotels in Shimla
url = "https://www.goibibo.com/hotels/hotels-in-shimla-ct/"
# collect the src attribute of every image tag
# NOTE(review): `images` is presumably the result of a BeautifulSoup
# find_all('img') call; it is created outside this snippet.
# .get() yields None for a tag that has no src instead of raising KeyError.
image_src = [x.get('src') for x in images]
# keep only the .jpg images (this also drops tags that had no src)
image_src = [x for x in image_src if x and x.endswith('.jpg')]
for image in image_src:
    print(image)
# Download every image in image_src to image_<n>.jpg in the working directory,
# where <n> is the image's 1-based position in the list.
image_count = 1
for image in image_src:
    try:
        # Fetch before opening the output file so a failed request does not
        # leave an empty image_<n>.jpg behind. The timeout keeps one
        # unresponsive host from hanging the whole loop.
        res = requests.get(image, timeout=30)
        # Reject 4xx/5xx responses so an HTML error page is never saved
        # under a .jpg name.
        res.raise_for_status()
    except requests.RequestException as err:
        # One bad URL should not abort the remaining downloads.
        print('Skipping', image, ':', err)
    else:
        with open('image_' + str(image_count) + '.jpg', 'wb') as f:
            f.write(res.content)
    # Advance even after a skip so file numbers keep matching list positions.
    image_count = image_count + 1
@lakshay-arora
lakshay-arora / lazy_1.py
Last active October 14, 2019 07:10
Lazy Evaluation in Spark Part-1
# build a sample list holding the integers 1 through 9,999,999
my_list = list(range(1, 10000000))
# hand the list to Spark, asking for 3 partitions
# NOTE(review): `sc` is presumably the active SparkContext; it is created
# outside this snippet.
rdd_0 = sc.parallelize(my_list, 3)
# bare expression: in a notebook/REPL this echoes the RDD object
rdd_0