Yahoo_answer_crawler
# encoding=utf8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from datetime import datetime
import datetime as dt
import codecs
import os
import time, random, re
import urllib
def crawlTopic(n, weblink):
    target = open("yahootopic.csv", 'w')
    target.write("topic_name,topic_time,answer,best_answer,username,answer_time,script_time\n")
    driver = webdriver.Chrome("./chromedriver")
    driver.get(weblink)
    # number of times to scroll down and load more topics
    for i in range(n):
        print i
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll down to the bottom
        time.sleep(3)  # sleep for a while to help load pages
    element = driver.find_elements_by_xpath("//div[@class='Bfc']/h3")
    print len(element)
    # visit every topic found on the page
    for index in range(len(element)):
        print "index", index
        current_time = str(datetime.now())  # time at which the topic is crawled
        topic_name = "\"" + element[index].text.encode('utf-8').replace("\"", "\'") + "\""  # get topic name
        topic_link = element[index].find_element_by_tag_name('a').get_attribute('href').encode('utf-8')
        detail = element[index].find_element_by_xpath("./../div[@class='Clr-888 Fz-12 Lh-18']")
        topic_date = detail.text.encode('utf-8').split(" · ")[2]
        print topic_name
        crawPost(topic_link, target, current_time, topic_name, topic_date)
        print "finish current topic"
    target.close()
    driver.quit()
def crawPost(link, target, current_time, topic_name, topic_date):
    r = urllib.urlopen(link).read()
    soup = BeautifulSoup(r, "html.parser")
    page_div = soup.find_all("div", class_="Mstart-75 Pos-r")
    for div in page_div:
        isBest = "0"
        # find post content
        content = div.find_all("span", class_="ya-q-full-text")
        post = "\"" + content[0].text.encode('utf-8').replace("\"", "\'") + "\""
        # find user name
        username = ""
        uname = div.find_all("a", class_="uname Clr-b")
        if len(uname) > 0:
            username = uname[0].text.encode('utf-8')
        # find post time
        time_span = div.find_all("span", class_="Clr-88 ya-localtime")
        post_time = time_span[0].text.encode('utf-8').replace(" · ", "")
        # find whether it is the best answer
        best = div.find_all("span", class_="Hpx-15 Wpx-14 D-ib shared-sprite win-best-answer-icon-14")
        if len(best) > 0:
            isBest = "1"
        target.write(topic_name + "," + topic_date + "," + post + "," + isBest + "," + username + "," + post_time + "," + current_time + "\n")
    # find the next page and recurse; stop when the last link points back to this page
    next_page = soup.find_all("div", class_="Pstart-75 Bgc-w Lh-16 Bdstart-1g Bdend-1g Py-20")
    if len(next_page) > 0:
        bottom = next_page[0].find_all("a")
        next_link = "https://answers.yahoo.com" + bottom[len(bottom) - 1].get('href')
        print next_link
        if link != next_link:
            crawPost(next_link, target, current_time, topic_name, topic_date)
def main():
    # the first argument controls how many extra scroll-loads of topics to fetch
    crawlTopic(0, "https://answers.yahoo.com/dir/index?sid=396545444")

if __name__ == "__main__":
    main()
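
The crawler appends one CSV row per answer to yahootopic.csv, in the column order of the header that crawlTopic writes. Below is a minimal sketch for inspecting that file afterwards, assuming the script has already been run in the same directory; since only topic_name and the answer text are quoted by the writer, rows whose other fields happen to contain a comma are simply skipped.

import csv

# Count answers and best answers per topic from the crawler's output.
with open("yahootopic.csv") as f:
    reader = csv.reader(f)
    header = next(reader)  # topic_name, topic_time, answer, best_answer, ...
    counts = {}            # topic_name -> [answer count, best-answer count]
    for row in reader:
        if len(row) != len(header):
            continue  # row broken by an unquoted comma; skip it
        topic, is_best = row[0], row[3]
        counts.setdefault(topic, [0, 0])
        counts[topic][0] += 1
        counts[topic][1] += int(is_best == "1")

for topic, pair in counts.items():
    print("%s: %d answers, %d best" % (topic, pair[0], pair[1]))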