@abhigenie92
Created June 26, 2015 16:18
Python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import urllib, requests, unidecode, lxml
class wait_for_more_than_n_elements_to_be_present(object):
    """Custom expected condition: true once more than `count` elements match `locator`."""

    def __init__(self, locator, count):
        self.locator = locator
        self.count = count

    def __call__(self, driver):
        try:
            elements = EC._find_elements(driver, self.locator)
            return len(elements) > self.count
        except StaleElementReferenceException:
            return False
def return_html_code(url):
    driver = webdriver.Firefox()
    driver.maximize_window()
    driver.get(url)

    # initial wait for the tweets to load
    wait = WebDriverWait(driver, 10)
    wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "li[data-item-id]")))

    # scroll down to the last tweet until no more tweets are loaded
    while True:
        tweets = driver.find_elements_by_css_selector("li[data-item-id]")
        number_of_tweets = len(tweets)
        print number_of_tweets
        driver.execute_script("arguments[0].scrollIntoView();", tweets[-1])
        try:
            wait.until(wait_for_more_than_n_elements_to_be_present(
                (By.CSS_SELECTOR, "li[data-item-id]"), number_of_tweets))
        except TimeoutException:
            # no new tweets appeared within the timeout, so we have reached the end
            break

    html_full_source = driver.page_source
    driver.close()
    return html_full_source
url = 'https://twitter.com/thecoolstacks'

# using selenium browser
html_source = return_html_code(url)
soup = BeautifulSoup(html_source, "lxml")
for tweet in soup.select("div.tweet div.content"):
    print tweet.p.text
# using requests module
# if False:
#     req = requests.get(url)
#     soup = BeautifulSoup(req.content)
#     text_tweet = []
#     alltweets = soup.find_all(attrs={'data-item-type': 'tweet'})
#     for tweet in alltweets:
#         # text of tweet
#         html_tweet = tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
#         text_tweet.append(''.join(html_tweet[0].findAll(text=True)))
#     print text_tweet

# final code
# alltweets_selenium = soup_selenium.find_all(attrs={'data-item-type': 'tweet'})
# for tweet in alltweets_selenium:
#     # text of tweet
#     html_tweet = tweet.find_all("p", class_="TweetTextSize TweetTextSize--16px js-tweet-text tweet-text")
#     text_tweet.append(''.join(html_tweet[0].findAll(text=True)))
# print text_tweet
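
# note: WebDriverWait.until accepts any callable that takes the driver and returns a
# truthy value, so the custom expected-condition class above could also be written as a
# small closure. A minimal sketch, kept commented out like the alternatives above; the
# helper name `more_tweets_than` is only illustrative and not part of the original gist:
#
# def more_tweets_than(count):
#     def check(driver):
#         try:
#             return len(driver.find_elements_by_css_selector("li[data-item-id]")) > count
#         except StaleElementReferenceException:
#             return False
#     return check
#
# wait.until(more_tweets_than(number_of_tweets))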