Skip to content

Instantly share code, notes, and snippets.

@pwcahyo
Created January 31, 2016 13:47
Show Gist options
  • Save pwcahyo/28ce1f511246f51a422d to your computer and use it in GitHub Desktop.
Save pwcahyo/28ce1f511246f51a422d to your computer and use it in GitHub Desktop.
tweet
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class TwitterscrapingItem(scrapy.Item):
    """Container for the data scraped from a single tweet."""

    # Running sequence number assigned by the spider.
    index = scrapy.Field()

    # Author information.
    userid = scrapy.Field()
    username = scrapy.Field()
    fullname = scrapy.Field()

    # Tweet body: tag-stripped text and the raw markup it was derived from.
    text_tweet = scrapy.Field()
    original_text_tweet = scrapy.Field()

    # Pagination cursor that was in effect when the tweet was scraped.
    max_position = scrapy.Field()

    # Tweet metadata.
    hash_tags = scrapy.Field()
    time_tweet = scrapy.Field()
    lang = scrapy.Field()
    retweets = scrapy.Field()
    favorite = scrapy.Field()

    # Geo information attached to the tweet, when present.
    place_id = scrapy.Field()
    place = scrapy.Field()
from scrapy.spiders import Spider
from scrapy.http.request import Request
from scrapy.selector import Selector
from twitterscraping.items import TwitterscrapingItem
from scrapy.http.headers import Headers
from w3lib.html import remove_tags
import json
class TwitterSpider(Spider):
    """Scrape tweets from a Twitter search for dengue-related terms.

    The crawl starts from the HTML search page and from the JSON timeline
    endpoint.  Every JSON response carries a ``min_position`` cursor that is
    fed back into the timeline endpoint to request the next page.  One
    ``TwitterscrapingItem`` is yielded per ``div.content`` node found.
    """

    # Running count of yielded items; copied into each item's ``index`` field.
    index = 0
    # Date bounds placed after the query's ``since:`` / ``until:`` operators.
    start = '2016-01-25'
    end = '2016-01-26'
    name = "twitter"
    allowed_domains = ["twitter.com"]

    # URL-encoded query shared by every request; it ends right before the
    # ``since:`` date so the date bounds can be appended.
    _SEARCH_QUERY = (
        "f=tweets&vertical=default&q=%22demam%20berdarah%22%20OR%20dbd%20OR%20dhf"
        "%20OR%20%22dengue%20fever%22%20OR%20%22dengue%20hemorrhagic%22"
        "%20OR%20%22sakit%20db%22%20lang%3Aid%20since%3A"
    )

    # XPath fragments (relative to a tweet's ``div.content``) reused below.
    _TWEET_XPATH = '//div[@class="content"]'
    _HEADER = 'div[@class="stream-item-header"]'
    _TEXT = 'p[@class="TweetTextSize js-tweet-text tweet-text"]'
    _ACTIONS = ('div[@class="stream-item-footer"]'
                '/div[@class="ProfileTweet-actionList js-actions"]')
    _GEO = _HEADER + '/span[@class="Tweet-geo u-floatRight js-tooltip"]'

    start_urls = [
        "https://twitter.com/search?" + _SEARCH_QUERY + start + "%20until%3A" + end + "&src=typd",
        "https://twitter.com/i/search/timeline?" + _SEARCH_QUERY + start + "%20until%3A" + end
        + "&src=typd&include_available_features=1&include_entities=1&max_position=TWEET-691759856034910209-691771627034517505-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA&reset_error_state=false",
    ]

    def parse(self, response):
        """Yield the follow-up timeline request (if any) and the tweet items.

        JSON responses are expected to carry the tweet markup in
        ``items_html`` and the next cursor in ``min_position``; any other
        response is treated as an HTML page and queried directly.
        """
        max_position = ''
        # Header values are bytes in Scrapy, and the header may be absent or
        # formatted differently (case, spacing), so match it loosely.
        content_type = response.headers.get('Content-Type') or b''
        if b'application/json' in content_type.lower():
            data = json.loads(response.body)
            tweets = Selector(text=data.get('items_html') or '').xpath(self._TWEET_XPATH)
            max_position = data.get('min_position') or ''
            # Without a cursor there is no next page to ask for.
            if max_position:
                yield Request(self._timeline_url(max_position),
                              callback=self.parse,
                              method="GET")
        else:
            tweets = Selector(response).xpath(self._TWEET_XPATH)

        for sel in tweets:
            yield self._build_item(sel, max_position)

    def _timeline_url(self, max_position):
        """Return the JSON timeline URL for the page after *max_position*."""
        return (
            "https://twitter.com/i/search/timeline?" + self._SEARCH_QUERY
            + self.start + "%20until%3A" + self.end
            + "&src=typd&include_available_features=1&include_entities=1"
            + "&last_note_ts=103&max_position=" + max_position
            + "&reset_error_state=false"
        )

    @staticmethod
    def _joined(sel, xpath, sep=''):
        """Return the stripped results of *xpath* on *sel*, joined by *sep*."""
        return sep.join(value.strip() for value in sel.xpath(xpath).extract())

    def _build_item(self, sel, max_position):
        """Build a ``TwitterscrapingItem`` from one tweet's ``div.content``."""
        self.index += 1
        item = TwitterscrapingItem()
        item['index'] = self.index
        item['userid'] = self._joined(sel, self._HEADER + '/a/@data-user-id')
        item['username'] = self._joined(
            sel,
            self._HEADER + '/a/span[@class="username js-action-profile-name"]/b/text()')
        item['fullname'] = self._joined(sel, self._HEADER + '/a/strong/text()')

        text_tweet = self._joined(sel, self._TEXT)
        # '\\u' is a literal backslash followed by "u"; written escaped so the
        # module also compiles on Python 3, where a bare '\u' is a syntax error.
        item['text_tweet'] = remove_tags(text_tweet).replace('\n', ' ').replace('\\u', ' ')
        item['original_text_tweet'] = text_tweet

        hash_tags = self._joined(
            sel,
            self._TEXT + '/a[@class="twitter-hashtag pretty-link js-nav"]',
            sep=',')
        item['hash_tags'] = remove_tags(hash_tags)
        item['time_tweet'] = self._joined(
            sel, self._HEADER + '/small[@class="time"]/a/@title')
        item['lang'] = self._joined(sel, self._TEXT + '/@lang')

        retweets = self._joined(
            sel,
            self._ACTIONS
            + '/div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]'
            + '/button[@class="ProfileTweet-actionButton js-actionButton js-actionRetweet"]'
            + '/div[@class="IconTextContainer"]')
        item['retweets'] = remove_tags(retweets).strip()

        favorite = self._joined(
            sel,
            self._ACTIONS
            + '/div[@class="ProfileTweet-action ProfileTweet-action--favorite js-toggleState"]'
            + '/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]'
            + '/div[@class="IconTextContainer"]')
        item['favorite'] = remove_tags(favorite).strip()

        item['place_id'] = self._joined(sel, self._GEO + '/a/@data-place-id')
        item['place'] = self._joined(
            sel, self._GEO + '/a/span[@class="u-hiddenVisually"]/text()')
        item['max_position'] = max_position
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment