Skip to content

Instantly share code, notes, and snippets.

@nveenverma
Created June 14, 2019 07:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nveenverma/38898fcfce6c21c1518367d18dc97152 to your computer and use it in GitHub Desktop.
Save nveenverma/38898fcfce6c21c1518367d18dc97152 to your computer and use it in GitHub Desktop.
def get_top_questions(url, question_count):
    """Scrape the top-voted questions listed at ``url`` into a DataFrame.

    The page is requested with ``?sort=votes&pagesize=<question_count>``
    appended to ``url``, parsed with BeautifulSoup, and the per-question
    fields are pulled out with CSS selectors.

    Args:
        url: Base URL of the question listing. The query string is appended
            verbatim, so ``url`` should not already contain one.
        question_count: Page size to request. Only 15, 30 or 50 should be
            used, since Stack Overflow does not serve other page sizes.

    Returns:
        A ``pandas.DataFrame`` with one row per question and the columns
        ``question``, ``summary``, ``tags`` (list of tag names),
        ``no_of_votes``, ``no_of_answers`` and ``no_of_views``.

    Note:
        ``error_checking`` is defined elsewhere in this file; it is called
        after every extraction with the selected elements and
        ``question_count`` (presumably to verify that the expected number of
        elements was found -- confirm against its definition).
    """
    # WARNING: Only enter one of these 3 values [15, 30, 50],
    # since Stack Overflow doesn't display question lists of any other size.
    url = url + "?sort=votes&pagesize={}".format(question_count)

    # Download the page and parse its HTML.
    response = requests.get(url)
    soup = bs(response.content, 'html.parser')
    body = soup.find('body')

    # Question titles.
    question_links = body.select("h3 a.question-hyperlink")
    error_checking(question_links, question_count)
    questions = [link.text for link in question_links]

    # Question summaries (excerpts).
    summary_divs = body.select("div.excerpt")
    error_checking(summary_divs, question_count)
    summaries = [div.text.strip() for div in summary_divs]

    # Tags: one list of tag names per question.
    tags_divs = body.select("div.summary > div:nth-of-type(2)")
    error_checking(tags_divs, question_count)
    tags = [[a.text for a in div.select('a')] for div in tags_divs]

    # Vote counts.
    vote_spans = body.select("span.vote-count-post strong")
    error_checking(vote_spans, question_count)
    no_of_votes = [int(span.text) for span in vote_spans]

    # Answer counts.
    answer_divs = body.select("div.status strong")
    error_checking(answer_divs, question_count)
    no_of_answers = [int(strong.text) for strong in answer_divs]

    # View counts: taken from each element's ``title`` attribute by dropping
    # its last six characters and any thousands separators before converting
    # to int (assumes titles look like "1,234 views" -- TODO confirm).
    div_views = body.select("div.supernova")
    error_checking(div_views, question_count)
    no_of_views = [
        int(div['title'][:-6].replace(',', '')) for div in div_views
    ]

    # Assemble all extracted columns into a single table.
    df = pd.DataFrame({
        'question': questions,
        'summary': summaries,
        'tags': tags,
        'no_of_votes': no_of_votes,
        'no_of_answers': no_of_answers,
        'no_of_views': no_of_views,
    })
    return df
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment