Skip to content

Instantly share code, notes, and snippets.

@annecool37
Last active August 20, 2016 19:31
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save annecool37/7e68adf34afda7fd24f95172bb1dcc38 to your computer and use it in GitHub Desktop.
Save annecool37/7e68adf34afda7fd24f95172bb1dcc38 to your computer and use it in GitHub Desktop.
meetup_web_scraping
### Workflow Example:
### 1. Create master meetup soup
### 2. Convert all event URLs into BeautifulSoup objects
### 3. Extract number of participants for an event
# 1.
# Pass in the search result url listing events in that area
# Click the "Show More" button and scroll down the page until a pre-specified time shows up
# Convert the whole page into a Beautifulsoup object
def get_meet_up_soup(url, to_which_date,
                     driver_path='/Users/annecool37/Documents/chromedriver'):
    """Create the master meetup soup.

    Opens the search-result page in Chrome, clicks the "Show more" button,
    then keeps scrolling to the bottom until the docked date banner contains
    `to_which_date`, and returns the fully expanded page as a soup.

    Parameters
    ----------
    url : str
        Meetup search-result URL listing events in an area.
    to_which_date : str
        Date text to scroll down to (substring-matched against the
        '#docked-event-date' banner).
    driver_path : str, optional
        Path to the chromedriver binary (previously hard-coded).

    Returns
    -------
    BeautifulSoup
        Soup of the page source once `to_which_date` is visible.
    """
    # create driver for website
    driver = webdriver.Chrome(driver_path)
    # implicit wait (10 s) applies to the element lookups below
    driver.implicitly_wait(10)
    # pass the url to Chrome
    driver.get(url)
    try:
        # locate and click the "Show more" button at the bottom
        btn = driver.find_element_by_class_name('simple-post-result-wrap')
        btn.click()
        driver.implicitly_wait(10)
        temp_soup = BeautifulSoup(driver.page_source, 'lxml')
        # Scrape all data till 'to_which_date'
        while to_which_date not in temp_soup.find('div', {'id': 'docked-event-date'}).get_text():
            # scroll to the bottom to reveal more events
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            # BUG FIX: re-parse the page on every pass; the original only
            # refreshed the raw html, never `temp_soup`, so the loop
            # condition could never change and the loop never terminated
            temp_soup = BeautifulSoup(driver.page_source, 'lxml')
        return temp_soup
    finally:
        # BUG FIX: release the browser even on failure (original leaked it)
        driver.quit()
# 2.
def event_url_to_html(meet_up_soup):
    """Fetch every event page linked from the master soup as a soup object.

    Each matching row div carries its event link in its first <a> tag;
    every link is downloaded and parsed with lxml.
    """
    # collect the event links from the result rows
    row_divs = meet_up_soup.find_all('div', 'row-item row-item--shrink text--secondary')
    links = []
    for div in row_divs:
        links.append(div.a['href'])
    # download and parse each event page
    event_soups = []
    for link in links:
        page = requests.get(link)
        event_soups.append(BeautifulSoup(page.text, 'lxml'))
    return event_soups
# Instead of writing tons of try and except, which are incompatible with list comprehension
# This exception catcher function can be implemented into list comprehension
def catch(func, handle=lambda e: 'NA', *args, **kwargs):
    """Call ``func(*args, **kwargs)``, routing any exception through `handle`.

    Useful inside list comprehensions, where try/except statements are
    not allowed.

    Parameters
    ----------
    func : callable
        The function to invoke.
    handle : callable, optional
        Called with the caught exception; its return value becomes the
        result. Defaults to returning the string 'NA'.
    *args, **kwargs
        Forwarded to `func`.

    Returns
    -------
    The result of `func`, or `handle(e)` if `func` raised.
    """
    try:
        return func(*args, **kwargs)
    except Exception as e:
        # BUG FIX: the original accepted `handle` but never used it,
        # always returning the literal 'NA'. The default is now
        # `lambda e: 'NA'`, so default behavior is unchanged while an
        # explicitly supplied handler is finally honored.
        return handle(e)
# 3.
def get_participant_count(meet_up_soup):
    """Return the participant count for each event on the page.

    Pulls the first integer out of every "attendee-count" div; a tag
    whose text yields no integer produces 'NA' (via `catch`).
    """
    count_tags = meet_up_soup.find_all('div', {'class': "attendee-count"})
    counts = []
    for tag in count_tags:
        counts.append(catch(lambda: int(re.search(r'\d+', tag.get_text()).group())))
    return counts
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment