Yahoo_answer_crawler
# encoding=utf8
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
from datetime import datetime
import datetime as dt
import codecs
import os
import time, random, re
import urllib
def crawlTopic(n, weblink):
    target = open("yahootopic.csv", 'w')
    target.write("topic_name,topic_time,answer,best_answer,username,answer_time,script_time\n")
    driver = webdriver.Chrome("./chromedriver")
    driver.get(weblink)
    # number of times to scroll down and load more topics
    for i in range(n):
        print i
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")  # scroll down to the bottom
        time.sleep(3)  # sleep for a while to help load pages
    element = driver.find_elements_by_xpath("//div[@class='Bfc']/h3")
    print len(element)
    # visit every topic found on the page
    for index in range(len(element)):
        print "index", index
        current_time = str(datetime.now())  # time at which the topic is crawled
        topic_name = "\"" + element[index].text.encode('utf-8').replace("\"", "\'") + "\""  # get topic name
        topic_link = element[index].find_element_by_tag_name('a').get_attribute('href').encode('utf-8')
        detail = element[index].find_element_by_xpath("./../div[@class='Clr-888 Fz-12 Lh-18']")
        topic_date = detail.text.encode('utf-8').split(" · ")[2]
        print topic_name
        crawPost(topic_link, target, current_time, topic_name, topic_date)
        print "finish current topic"
    target.close()
    driver.quit()
def crawPost(link, target, current_time, topic_name, topic_date):
    r = urllib.urlopen(link).read()
    soup = BeautifulSoup(r, "html.parser")
    page_div = soup.find_all("div", class_="Mstart-75 Pos-r")
    for div in page_div:
        isBest = "0"
        # find post content
        content = div.find_all("span", class_="ya-q-full-text")
        post = "\"" + content[0].text.encode('utf-8').replace("\"", "\'") + "\""
        # find user name
        username = ""
        uname = div.find_all("a", class_="uname Clr-b")
        if len(uname) > 0:
            username = uname[0].text.encode('utf-8')
        # find post time
        time_span = div.find_all("span", class_="Clr-88 ya-localtime")
        post_time = time_span[0].text.encode('utf-8').replace(" · ", "")
        # find whether it is the best answer
        best = div.find_all("span", class_="Hpx-15 Wpx-14 D-ib shared-sprite win-best-answer-icon-14")
        if len(best) > 0:
            isBest = "1"
        target.write(topic_name + "," + topic_date + "," + post + "," + isBest + "," + username + "," + post_time + "," + current_time + "\n")
    # find the next page and recurse; stop when the last link points back to this page
    next_page = soup.find_all("div", class_="Pstart-75 Bgc-w Lh-16 Bdstart-1g Bdend-1g Py-20")
    if len(next_page) > 0:
        bottom = next_page[0].find_all("a")
        next_link = "https://answers.yahoo.com" + bottom[len(bottom) - 1].get('href')
        print next_link
        if link != next_link:
            crawPost(next_link, target, current_time, topic_name, topic_date)
def main():
    # the first argument controls how many extra scroll-loads of topics to fetch
    crawlTopic(0, "https://answers.yahoo.com/dir/index?sid=396545444")

if __name__ == "__main__":
    main()
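
The crawler appends one CSV row per answer to yahootopic.csv, in the column order of the header that crawlTopic writes. Below is a minimal sketch for inspecting that file afterwards, assuming the script has already been run in the same directory; since only topic_name and the answer text are quoted by the writer, rows whose other fields happen to contain a comma are simply skipped.

import csv

# Count answers and best answers per topic from the crawler's output.
with open("yahootopic.csv") as f:
    reader = csv.reader(f)
    header = next(reader)  # topic_name, topic_time, answer, best_answer, ...
    counts = {}            # topic_name -> [answer count, best-answer count]
    for row in reader:
        if len(row) != len(header):
            continue  # row broken by an unquoted comma; skip it
        topic, is_best = row[0], row[3]
        counts.setdefault(topic, [0, 0])
        counts[topic][0] += 1
        counts[topic][1] += int(is_best == "1")

for topic, pair in counts.items():
    print("%s: %d answers, %d best" % (topic, pair[0], pair[1]))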