Created
January 31, 2016 13:47
-
-
Save pwcahyo/28ce1f511246f51a422d to your computer and use it in GitHub Desktop.
tweet
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy | |
class TwitterscrapingItem(scrapy.Item):
    """Record holding the values scraped for a single tweet."""

    # Position of the tweet in the crawl (the spider assigns a running count).
    index = scrapy.Field()

    # Author of the tweet.
    userid = scrapy.Field()
    username = scrapy.Field()
    fullname = scrapy.Field()

    # Tweet body: tag-stripped text and the raw markup it was derived from.
    text_tweet = scrapy.Field()
    original_text_tweet = scrapy.Field()

    # Pagination cursor of the page the tweet was found on.
    max_position = scrapy.Field()

    # Tweet metadata.
    hash_tags = scrapy.Field()
    time_tweet = scrapy.Field()
    lang = scrapy.Field()

    # Engagement counters, stored as scraped text.
    retweets = scrapy.Field()
    favorite = scrapy.Field()

    # Geo tag attached to the tweet, when present.
    place_id = scrapy.Field()
    place = scrapy.Field()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.spiders import Spider | |
from scrapy.http.request import Request | |
from scrapy.selector import Selector | |
from twitterscraping.items import TwitterscrapingItem | |
from scrapy.http.headers import Headers | |
from w3lib.html import remove_tags | |
import json | |
class TwitterSpider(Spider):
    """Collect tweets matching a fixed dengue-related search on twitter.com.

    The crawl starts from the HTML search page and from the JSON timeline
    endpoint, then follows the timeline's ``min_position`` cursor from page
    to page. Every tweet found is emitted as a ``TwitterscrapingItem``.
    """

    index = 0  # running count of tweets emitted so far
    start = '2016-01-25'  # value of the ``since:`` search operator
    end = '2016-01-26'  # value of the ``until:`` search operator
    name = "twitter"
    allowed_domains = ["twitter.com"]

    # URL-encoded search expression shared by every request:
    # "demam berdarah" OR dbd OR dhf OR "dengue fever"
    # OR "dengue hemorrhagic" OR "sakit db" lang:id
    _QUERY = (
        "%22demam%20berdarah%22%20OR%20dbd%20OR%20dhf%20OR%20%22dengue%20fever%22"
        "%20OR%20%22dengue%20hemorrhagic%22%20OR%20%22sakit%20db%22%20lang%3Aid"
    )
    _SEARCH_URL = "https://twitter.com/search?f=tweets&vertical=default&q="
    _TIMELINE_URL = "https://twitter.com/i/search/timeline?f=tweets&vertical=default&q="
    _TIMELINE_FLAGS = "&src=typd&include_available_features=1&include_entities=1"

    # Cursor used for the first timeline request.
    # NOTE(review): presumably captured by hand for this exact query and date
    # range and needs refreshing when either changes -- confirm.
    _SEED_POSITION = "TWEET-691759856034910209-691771627034517505-BD1UO2FFu9QAAAAAAAAETAAAAAcAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"

    start_urls = [
        _SEARCH_URL + _QUERY + "%20since%3A" + start + "%20until%3A" + end + "&src=typd",
        _TIMELINE_URL + _QUERY + "%20since%3A" + start + "%20until%3A" + end
        + _TIMELINE_FLAGS + "&max_position=" + _SEED_POSITION + "&reset_error_state=false",
    ]

    # XPath building blocks. Everything except _TWEET_XPATH is relative to
    # one tweet node selected by _TWEET_XPATH.
    _TWEET_XPATH = '//div[@class="content"]'
    _HEADER = 'div[@class="stream-item-header"]'
    _TEXT = 'p[@class="TweetTextSize js-tweet-text tweet-text"]'
    _GEO_LINK = _HEADER + '/span[@class="Tweet-geo u-floatRight js-tooltip"]/a'
    _ACTIONS = (
        'div[@class="stream-item-footer"]'
        '/div[@class="ProfileTweet-actionList js-actions"]'
    )
    _RETWEET_COUNT = _ACTIONS + (
        '/div[@class="ProfileTweet-action ProfileTweet-action--retweet js-toggleState js-toggleRt"]'
        '/button[@class="ProfileTweet-actionButton js-actionButton js-actionRetweet"]'
        '/div[@class="IconTextContainer"]'
    )
    _FAVORITE_COUNT = _ACTIONS + (
        '/div[@class="ProfileTweet-action ProfileTweet-action--favorite js-toggleState"]'
        '/button[@class="ProfileTweet-actionButton js-actionButton js-actionFavorite"]'
        '/div[@class="IconTextContainer"]'
    )

    def parse(self, response):
        """Yield the next-page request (JSON pages only) and one item per tweet.

        Two response shapes are handled. The HTML search page carries its
        tweets directly in the document. The JSON timeline payload carries
        them in ``items_html`` and supplies ``min_position``, the cursor
        used to request the following page.
        """
        max_position = ''

        # Header values are bytes under Python 3, and the header may be
        # absent or formatted differently (spacing, case), so test for the
        # media type instead of comparing the whole value to a text string.
        content_type = response.headers.get('Content-Type') or b''
        if b'application/json' in content_type.lower():
            data = json.loads(response.body.decode('utf-8'))
            tweets = Selector(text=data.get('items_html', '')).xpath(self._TWEET_XPATH)
            max_position = data.get('min_position') or ''
            # Without a cursor there is no next page to ask for.
            if max_position:
                yield Request(
                    self._next_page_url(max_position),
                    callback=self.parse,
                    method="GET",
                )
        else:
            tweets = Selector(response).xpath(self._TWEET_XPATH)

        for sel in tweets:
            self.index += 1
            yield self._build_item(sel, max_position)

    def _next_page_url(self, max_position):
        """Return the timeline URL that continues from cursor *max_position*."""
        return (
            self._TIMELINE_URL + self._QUERY
            + "%20since%3A" + self.start + "%20until%3A" + self.end
            + self._TIMELINE_FLAGS
            + "&last_note_ts=103&max_position=" + max_position
            + "&reset_error_state=false"
        )

    @staticmethod
    def _joined(sel, xpath, sep=''):
        """Evaluate *xpath* on *sel* and join the stripped matches with *sep*."""
        return sep.join(value.strip() for value in sel.xpath(xpath).extract())

    def _build_item(self, sel, max_position):
        """Fill a ``TwitterscrapingItem`` from one tweet node *sel*."""
        joined = self._joined
        raw_text = joined(sel, self._TEXT)
        hash_tags = joined(
            sel, self._TEXT + '/a[@class="twitter-hashtag pretty-link js-nav"]', ',')

        item = TwitterscrapingItem()
        item['index'] = self.index
        item['userid'] = joined(sel, self._HEADER + '/a/@data-user-id')
        item['username'] = joined(
            sel,
            self._HEADER + '/a/span[@class="username js-action-profile-name"]/b/text()')
        item['fullname'] = joined(sel, self._HEADER + '/a/strong/text()')
        # '\\u' is a literal backslash followed by "u" -- the same two
        # characters the Python 2 byte-string '\u' denoted; written with an
        # escaped backslash so the module also parses on Python 3.
        item['text_tweet'] = (
            remove_tags(raw_text).replace('\n', ' ').replace('\\u', ' '))
        item['original_text_tweet'] = raw_text
        item['hash_tags'] = remove_tags(hash_tags)
        item['time_tweet'] = joined(
            sel, self._HEADER + '/small[@class="time"]/a/@title')
        item['lang'] = joined(sel, self._TEXT + '/@lang')
        item['retweets'] = remove_tags(joined(sel, self._RETWEET_COUNT)).strip()
        item['favorite'] = remove_tags(joined(sel, self._FAVORITE_COUNT)).strip()
        item['place_id'] = joined(sel, self._GEO_LINK + '/@data-place-id')
        item['place'] = joined(
            sel, self._GEO_LINK + '/span[@class="u-hiddenVisually"]/text()')
        item['max_position'] = max_position
        return item
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment