Last active
August 20, 2016 19:31
-
-
Save annecool37/7e68adf34afda7fd24f95172bb1dcc38 to your computer and use it in GitHub Desktop.
meetup_web_scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Workflow Example:
### 1. Create the master meetup soup
### 2. Convert all event URLs into BeautifulSoup objects
### 3. Extract the number of participants for an event
# 1.
# Pass in the search-result URL listing events in that area
# Click the "Show More" button and scroll down the page until a pre-specified time shows up
# Convert the whole page into a BeautifulSoup object
def get_meet_up_soup(url, to_which_date, driver_path='/Users/annecool37/Documents/chromedriver'):
    '''Create the master meetup soup.

    Loads the search-result page in Chrome, clicks the "Show more" button,
    then keeps scrolling to the bottom until the docked date banner shows
    `to_which_date`, and returns the fully expanded page as a BeautifulSoup
    object.

    url           -- meetup search-result URL listing events in an area
    to_which_date -- date string to scroll down to (substring-matched
                     against the '#docked-event-date' banner text)
    driver_path   -- path to the chromedriver binary; parameterized so the
                     function is not tied to one machine (default keeps the
                     original behavior)
    '''
    # create driver for the website
    driver = webdriver.Chrome(driver_path)
    try:
        # wait up to 10 sec for elements to appear
        driver.implicitly_wait(10)
        driver.get(url)
        # locate and click the "Show more" button at the bottom
        btn = driver.find_element_by_class_name('simple-post-result-wrap')
        btn.click()
        driver.implicitly_wait(10)
        html = driver.page_source
        temp_soup = BeautifulSoup(html, 'lxml')
        # Scroll until the docked date banner contains `to_which_date`.
        # BUG FIX: the soup must be re-parsed every iteration — the original
        # only updated `html`, so the loop condition never changed and the
        # loop could never terminate once entered.
        while to_which_date not in temp_soup.find('div', {'id': 'docked-event-date'}).get_text():
            # scroll to the bottom to reveal more events
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            html = driver.page_source
            temp_soup = BeautifulSoup(html, 'lxml')
        # convert the final html into a BeautifulSoup object
        meet_up_soup = BeautifulSoup(html, 'lxml')
    finally:
        # BUG FIX: always release the browser process, even if scraping
        # fails midway (the original leaked a Chrome instance per call)
        driver.quit()
    return meet_up_soup
# 2.
def event_url_to_html(meet_up_soup):
    '''Convert every event URL found in the master soup into a BeautifulSoup object.'''
    # pull the per-event rows, then the link target out of each row's anchor
    rows = meet_up_soup.find_all('div', 'row-item row-item--shrink text--secondary')
    urls = [row.a['href'] for row in rows]
    # fetch each event page and parse its html
    pages = []
    for link in urls:
        response = requests.get(link)
        pages.append(BeautifulSoup(response.text, 'lxml'))
    return pages
# Instead of writing many try/except statements, which are incompatible with list comprehensions,
# this exception-catcher function can be used inside a list comprehension
def catch(func, handle=lambda e: 'NA', *args, **kwargs):
    '''Call `func(*args, **kwargs)`, returning `handle(exception)` on failure.

    Usable inside list comprehensions, where try/except statements are not
    allowed.  By default any exception yields the sentinel string 'NA'
    (identical to the original behavior); pass a different `handle`
    callable to customize the fallback value.
    '''
    try:
        return func(*args, **kwargs)
    except Exception as e:
        # BUG FIX: the original accepted `handle` but never called it,
        # always returning the hard-coded 'NA' — the parameter was dead.
        return handle(e)
# 3.
def get_participant_count(meet_up_soup):
    '''Return the attendee count parsed from each "attendee-count" div on the page.'''
    attendee_tags = meet_up_soup.find_all('div', {'class': "attendee-count"})
    counts = []
    for tag in attendee_tags:
        # first run of digits in the tag text is the count; `catch` turns
        # any parse failure (no digits, missing text) into 'NA'
        # (tag bound as a default arg to avoid late-binding in the lambda)
        counts.append(catch(lambda t=tag: int(re.search(r'\d+', t.get_text()).group())))
    return counts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment