Skip to content

Instantly share code, notes, and snippets.

@Keithwachira
Created November 18, 2019 15:22
Show Gist options
  • Save Keithwachira/feff1400eb270c60f3655b188f14b0e2 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import csv
def harvest_data(url='https://www.jumia.co.ke', timeout=30):
    """Download the Jumia home page and extract its category menu.

    Args:
        url: Page to fetch; defaults to the Jumia Kenya home page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list with one ``((title, href), categories)`` tuple per
        ``li.menu-item`` element, as produced by ``process_entry``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        AttributeError: If the page has no ``ul.menu-items`` element
            (``soup.find`` returns ``None`` in that case).
    """
    # Without a timeout requests waits indefinitely on a stalled connection.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    menu_elem = soup.find('ul', {'class': 'menu-items'})
    menu_entries = menu_elem.find_all('li', {'class': 'menu-item'})
    return [process_entry(elem) for elem in menu_entries]
def process_entry(menu_elem):
    """Extract one top-level menu item and its nested categories.

    Args:
        menu_elem: A parsed ``li.menu-item`` element (anything exposing the
            BeautifulSoup ``find`` / ``find_all`` / ``text`` / ``[...]`` API).

    Returns:
        A tuple ``((title, main_href), categories)`` where ``categories``
        maps each category name to ``(category_href, sub_categories)`` and
        ``sub_categories`` is a list of ``(name, href)`` tuples. Missing
        pieces degrade to empty values: ``''`` for an absent title or main
        link, ``{}`` when the item has no submenu.
    """
    name_span = menu_elem.find('span', {'class': 'nav-subTxt'})
    title = name_span.text if name_span is not None else ''

    # ``href=True`` only matches anchors that carry an href attribute, so
    # the sole failure mode here is "no such anchor" (``None``).
    href_anchor = menu_elem.find('a', {'class': 'main-category'}, href=True)
    main_href = href_anchor['href'] if href_anchor is not None else ''

    categories = {}
    sub_menu_elem = menu_elem.find('div', {'class': 'submenu'})
    if sub_menu_elem is None:
        # Items without a fly-out submenu simply have no categories.
        return ((title, main_href), categories)

    for group in sub_menu_elem.find_all('div', {'class': 'categories'}):
        cat_elem = group.find('a', {'class': 'category'}, href=True)
        if cat_elem is None:
            # A group without a linked heading cannot be keyed; skip it.
            continue
        sub_categories = [
            (sub_elem.text, sub_elem['href'])
            for sub_elem in group.find_all(
                'a', {'class': 'subcategory'}, href=True)
        ]
        # Later groups with the same heading text overwrite earlier ones.
        categories[cat_elem.text] = (cat_elem['href'], sub_categories)
    return ((title, main_href), categories)
def write_data(data, outpath):
    """Flatten the scraped menu hierarchy into a CSV file.

    Each title, category and sub-category becomes one row of
    ``id, title, parent_id, category_url``. Titles are numbered from 1,
    categories from 1000 and sub-categories from 10000; ``parent_id`` is
    empty for titles and otherwise holds the id of the enclosing row.

    Args:
        data: Iterable of ``((title, href), categories)`` tuples where
            ``categories`` maps a category name to
            ``(category_href, [(sub_name, sub_href), ...])``.
        outpath: Destination file path; an existing file is overwritten.
    """
    rows = []
    # The three id ranges are disjoint only while there are fewer than 999
    # titles and 9000 categories.
    title_id = 1
    category_id = 1000
    sub_category_id = 10000
    for (title, href), entries in data:
        rows.append((title_id, title, '', href))
        for category, (cat_href, sub_categories) in entries.items():
            rows.append((category_id, category, title_id, cat_href))
            for sub_cat, sub_href in sub_categories:
                rows.append((sub_category_id, sub_cat, category_id, sub_href))
                sub_category_id += 1
            category_id += 1
        title_id += 1

    headers = ['id', 'title', 'parent_id', 'category_url']
    # newline='' lets the csv module control line endings (otherwise Windows
    # gets a blank line after every row); an explicit encoding keeps
    # non-ASCII titles from depending on the platform locale.
    with open(outpath, 'w', newline='', encoding='utf-8') as data_file:
        writer = csv.writer(data_file)
        writer.writerow(headers)
        writer.writerows(rows)
if __name__ == '__main__':
    import sys

    # Script entry point: scrape the menu and write it out as CSV.
    # The destination may be given as the first command-line argument; the
    # original hard-coded location is kept as the default.
    default_path = '/Users/keithwacira/go/src/data/categories.csv'
    path = sys.argv[1] if len(sys.argv) > 1 else default_path
    data = harvest_data()
    write_data(data, path)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment