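"""Scrape Stack Overflow question listings for each tag in TAGS and write
one CSV file per tag with one row per question."""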
import requests
from bs4 import BeautifulSoup as bs
import re
import csv
TAGS = ["python"]
MAX_PAGES_PER_TAG = 20
FILENAME_PREFIX = "Dataset_"
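
# Each CSV row written below holds, in order: time asked, asker's reputation,
# asker's username (or "community-wiki"), votes, answers, views.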
# Set to False once a results page comes back empty, which ends
# pagination for the current tag.
flag = True

def get_data(url):
    """Fetch a URL and return the raw response body."""
    data = requests.get(url)
    return data.content

def extractUrl(url, csvWriter):
    """Parse one question-listing page and write one CSV row per question."""
    global flag
    soup = bs(get_data(url), "lxml")
    qs = soup.findAll("div", {"class": "question-summary"})
    if len(qs) == 0:
        # No questions on this page: stop paginating this tag.
        flag = False
        return
    for i in range(len(qs)):
        ans = []
        summary = qs[i].find("div", {"class": "summary"})
        stats = qs[i].find("div", {"class": "statscontainer"})
        ## Time the question was asked
        try:
            q = summary.find("span", {"class": "relativetime"})["title"]
            ans.append(q)
        except Exception:
            ans.append("None")
        ## Asker's reputation score
        try:
            z = summary.findAll("div", {"class": "user-details"})
            if len(z) == 1:
                # Abbreviated scores such as "10.5k" carry the exact value in
                # the span's title attribute, so prefer that when present.
                w1 = summary.find("div", {"class": "user-details"}).find("span", {"class": "reputation-score", "title": re.compile(r"reputation(\s)score(\s?)")})
                w2 = summary.find("div", {"class": "user-details"}).find("span", {"class": "reputation-score", "title": re.compile(r"reputation(\s)score(\s){1}(.+)")})
                if w2:
                    ans.append(w2["title"][len("reputation score "):])
                elif w1:
                    ans.append(w1.string)
                else:
                    ans.append("None")
            else:
                # Zero or two user-details blocks (e.g. edited questions):
                # no single asker score to record.
                ans.append("None")
        except Exception:
            ans.append("None")
        ## Asker's username and profile link
        try:
            e = summary.find("div", {"class": "user-details"}).find("a").string
            ans.append(e)
        except Exception:
            # Deleted users and community-wiki posts carry no profile link.
            e = summary.find("div", {"class": re.compile(r"user-info(\s?)")}).find("div", {"class": "user-details"})
            if e.find("span", {"class": "community-wiki"}):
                ans.append("community-wiki")
            else:
                ans.append(e.get_text().strip())
        ## Votes
        try:
            r = stats.find("span", {"class": re.compile(r"vote-count-post(\s*)")}).string
            ans.append(r)
        except Exception:
            print("Error: votes not found")
            ans.append("None")
        ## Number of answers
        try:
            t = stats.find("div", {"class": "status"}).find("strong").string
            ans.append(t)
        except Exception:
            print("Error: answer count not found")
            ans.append("None")
        ## Views: the title attribute reads "<N> views", so strip the suffix.
        try:
            y = stats.find("div", {"class": re.compile(r"(views(\s))")})["title"][:-6]
            ans.append(y)
        except Exception:
            print("Error: views not found")
            ans.append("None")
        print(ans)
        csvWriter.writerow(ans)
for tag_i in TAGS:
    # Reset pagination state so one exhausted tag does not stop the rest.
    flag = True
    with open(FILENAME_PREFIX + tag_i + '.csv', 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, lineterminator="\n")
        pageNo = 1
        while flag and pageNo <= MAX_PAGES_PER_TAG:
            url = ("https://stackoverflow.com/questions/tagged/" + tag_i.lower()
                   + "?page=" + str(pageNo) + "&sort=frequent&pageSize=50")
            pageNo += 1
            extractUrl(url, csv_writer)
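
# A minimal sketch (an addition, not in the original gist) for sanity-checking
# a scraped file after a run; the default path is an assumption derived from
# the config above. Call it by hand, e.g. preview_dataset().
def preview_dataset(path=FILENAME_PREFIX + TAGS[0] + ".csv", limit=5):
    """Print the first `limit` rows of a scraped CSV."""
    with open(path, newline='') as f:
        for i, row in enumerate(csv.reader(f)):
            if i >= limit:
                break
            print(row)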