Last active
August 20, 2016 19:31
-
-
Save annecool37/7e68adf34afda7fd24f95172bb1dcc38 to your computer and use it in GitHub Desktop.
meetup_web_scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
### Workflow Example:
### 1. Create the master meetup soup
### 2. Convert all event URLs into BeautifulSoup objects
### 3. Extract the number of participants for an event
# 1.
# Pass in the search-result URL listing events in that area
# Click the "Show More" button and scroll down the page until a pre-specified time shows up
# Convert the whole page into a BeautifulSoup object
def get_meet_up_soup(url, to_which_date, driver_path='/Users/annecool37/Documents/chromedriver'):
    '''Create the master meetup soup.

    Loads the search-result page in Chrome, clicks the "Show more" button,
    then keeps scrolling to the bottom until the docked date banner shows
    `to_which_date`, and returns the fully expanded page as a BeautifulSoup
    object.

    url           -- meetup search-result URL listing events in an area
    to_which_date -- date string to scroll down to (substring-matched
                     against the '#docked-event-date' banner text)
    driver_path   -- path to the chromedriver binary; parameterized so the
                     function is not tied to one machine (default keeps the
                     original behavior)
    '''
    # create driver for the website
    driver = webdriver.Chrome(driver_path)
    try:
        # wait up to 10 sec for elements to appear
        driver.implicitly_wait(10)
        driver.get(url)
        # locate and click the "Show more" button at the bottom
        btn = driver.find_element_by_class_name('simple-post-result-wrap')
        btn.click()
        driver.implicitly_wait(10)
        html = driver.page_source
        temp_soup = BeautifulSoup(html, 'lxml')
        # Scroll until the docked date banner contains `to_which_date`.
        # BUG FIX: the soup must be re-parsed every iteration — the original
        # only updated `html`, so the loop condition never changed and the
        # loop could never terminate once entered.
        while to_which_date not in temp_soup.find('div', {'id': 'docked-event-date'}).get_text():
            # scroll to the bottom to reveal more events
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            html = driver.page_source
            temp_soup = BeautifulSoup(html, 'lxml')
        # convert the final html into a BeautifulSoup object
        meet_up_soup = BeautifulSoup(html, 'lxml')
    finally:
        # BUG FIX: always release the browser process, even if scraping
        # fails midway (the original leaked a Chrome instance per call)
        driver.quit()
    return meet_up_soup
# 2.
def event_url_to_html(meet_up_soup):
    '''Convert every event URL found in the master soup into a BeautifulSoup object.'''
    # pull the per-event rows, then the link target out of each row's anchor
    rows = meet_up_soup.find_all('div', 'row-item row-item--shrink text--secondary')
    urls = [row.a['href'] for row in rows]
    # fetch each event page and parse its html
    pages = []
    for link in urls:
        response = requests.get(link)
        pages.append(BeautifulSoup(response.text, 'lxml'))
    return pages
# Instead of writing many try/except statements, which are incompatible with list comprehensions,
# this exception-catcher function can be used inside a list comprehension
def catch(func, handle=lambda e: 'NA', *args, **kwargs):
    '''Call `func(*args, **kwargs)`, returning `handle(exception)` on failure.

    Usable inside list comprehensions, where try/except statements are not
    allowed.  By default any exception yields the sentinel string 'NA'
    (identical to the original behavior); pass a different `handle`
    callable to customize the fallback value.
    '''
    try:
        return func(*args, **kwargs)
    except Exception as e:
        # BUG FIX: the original accepted `handle` but never called it,
        # always returning the hard-coded 'NA' — the parameter was dead.
        return handle(e)
# 3.
def get_participant_count(meet_up_soup):
    '''Return the attendee count parsed from each "attendee-count" div on the page.'''
    attendee_tags = meet_up_soup.find_all('div', {'class': "attendee-count"})
    counts = []
    for tag in attendee_tags:
        # first run of digits in the tag text is the count; `catch` turns
        # any parse failure (no digits, missing text) into 'NA'
        # (tag bound as a default arg to avoid late-binding in the lambda)
        counts.append(catch(lambda t=tag: int(re.search(r'\d+', t.get_text()).group())))
    return counts
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment