Skip to content

Instantly share code, notes, and snippets.

@RoyZhengGao
Created October 30, 2016 22:34
Show Gist options
  • Save RoyZhengGao/da6d4ab736a8aa3416bd854be9e4e71e to your computer and use it in GitHub Desktop.
Yahoo_answer_crawler
# encoding=utf8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from datetime import datetime
import datetime as dt
import codecs
import os
import time,random,re
import urllib
import re
def crawlTopic(n, weblink):
    """Crawl a Yahoo Answers topic-listing page and dump every topic's answers to CSV.

    Opens ``yahootopic.csv`` in the working directory, scrolls the listing page
    ``n`` times (each scroll triggers lazy-loading of more topics), then visits
    every topic found and delegates answer extraction to ``crawPost``.

    :param n: number of scroll-to-bottom iterations (0 = just the initial page)
    :param weblink: URL of the Yahoo Answers topic listing to crawl
    """
    target = open("yahootopic.csv", 'w')
    try:
        target.write("topic_name,topic_time,answer,best_answer,username,answer_time,script_time\n")
        # NOTE(review): assumes a chromedriver binary sits next to the script.
        driver = webdriver.Chrome("./chromedriver")
        try:
            driver.get(weblink)
            # Each scroll to the bottom makes the page lazy-load another batch
            # of topics; sleep so the new content has time to arrive.
            for i in range(n):
                print(i)
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(3)
            element = driver.find_elements_by_xpath("//div[@class='Bfc']/h3")
            print(len(element))
            for index in range(len(element)):
                print("index %d" % index)
                # Timestamp of when this topic was scraped (last CSV column).
                current_time = str(datetime.now())
                # CSV-quote the topic name; inner double quotes become single
                # quotes so the hand-rolled CSV line stays parseable.
                topic_name = "\"" + element[index].text.encode('utf-8').replace("\"", "\'") + "\""
                topic_link = element[index].find_element_by_tag_name('a').get_attribute('href').encode('utf-8')
                # The sibling div holds "author · category · date" metadata.
                detail = element[index].find_element_by_xpath("./../div[@class='Clr-888 Fz-12 Lh-18']")
                meta = detail.text.encode('utf-8').split(" · ")
                # Guard against metadata rows with fewer segments than expected.
                topic_date = meta[2] if len(meta) > 2 else ""
                print(topic_name)
                crawPost(topic_link, target, current_time, topic_name, topic_date)
                print("finish current topic")
        finally:
            # Always release the browser, even if a selenium call raised.
            driver.quit()
    finally:
        target.close()
def crawPost(link, target, current_time, topic_name, topic_date):
    """Scrape all answers of one Yahoo Answers topic page and append them to ``target``.

    Recurses into the "next page" link when the answers span multiple pages.

    :param link: URL of the topic's answers page
    :param target: open writable file object (the CSV being built)
    :param current_time: scrape timestamp, already stringified by the caller
    :param topic_name: CSV-quoted topic title
    :param topic_date: date segment from the topic's metadata line
    """
    html = urllib.urlopen(link).read()
    soup = BeautifulSoup(html)
    page_div = soup.find_all("div", class_="Mstart-75 Pos-r")
    for div in page_div:
        is_best = "0"
        # Answer body text, CSV-quoted with inner double quotes downgraded.
        content = div.find_all("span", class_="ya-q-full-text")
        post = "\"" + content[0].text.encode('utf-8').replace("\"", "\'") + "\""
        # Author name; anonymous answers have no uname anchor.
        username = ""
        uname = div.find_all("a", class_="uname Clr-b")
        if uname:
            username = uname[0].text.encode('utf-8')
        # Renamed from `time` so the stdlib time module is not shadowed.
        stamp = div.find_all("span", class_="Clr-88 ya-localtime")
        post_time = stamp[0].text.encode('utf-8').replace(" · ", "")
        # A best-answer badge icon marks the accepted answer.
        best = div.find_all("span", class_="Hpx-15 Wpx-14 D-ib shared-sprite win-best-answer-icon-14")
        if best:
            is_best = "1"
        target.write(topic_name + "," + topic_date + "," + post + "," + is_best + "," + username + "," + post_time + "," + current_time + "\n")
    # Pagination footer: the last anchor is the "next" link; recurse unless it
    # points back at the page we just scraped (last page links to itself).
    next_page = soup.find_all("div", class_="Pstart-75 Bgc-w Lh-16 Bdstart-1g Bdend-1g Py-20")
    if next_page:
        bottom = next_page[0].find_all("a")
        # Renamed from `next` so the builtin is not shadowed.
        next_url = "https://answers.yahoo.com" + bottom[-1].get('href')
        print(next_url)
        if link != next_url:
            crawPost(next_url, target, current_time, topic_name, topic_date)
def main():
    """Entry point: crawl the given Yahoo Answers category listing.

    The first argument is how many extra scroll-loads of topics to fetch
    (0 = only the topics visible on the initial page).
    """
    crawlTopic(0, "https://answers.yahoo.com/dir/index?sid=396545444")


# Guard the entry point so importing this module does not start a crawl.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment