Created
September 14, 2011 07:49
-
-
Save isomorphisms/1216058 to your computer and use it in GitHub Desktop.
grab the features out of tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl
# Prototype: pull simple features out of one hard-coded tweet -- the retweet
# originator, a leading "@" reply target, short links, and every other
# @mention -- printing the remaining text (and its length) after each stage.
# Hashtags, emoticons and the "rest of text" are still TODO.
use warnings; use strict;
use boolean;
use feature 'say';

# Sample inputs; move the leading '#' to try a different tweet shape.
#my $tweet = q{RT @peteyorn: @Starbucks Thanks for putting the Break Up album for sale in your stores. \\ It's a great album! Nice work!};
#my $tweet = q{@kohlgreyson Yesterday's announcement was just a preview. We will update the twitter icon in the spring.};
my $tweet = q{#StarbucksVIA is at Moonshine for the @Wired party for #SXSW. Two of our amazing designers are enjoying a VIAtini. http://yfrog.com/j59i1j};

my @names       = ();     # @mentions in order of appearance (originator is not included)
my $restoftweet = "";     # TODO: never populated yet
my $originator  = "";     # account named in "RT @name", if any
my @hashtags    = ();     # TODO: never populated yet
my @links       = ();     # short URLs removed from the tweet
my $retweet     = false;  # does the tweet contain "RT @name"?
my $response    = false;  # does the tweet begin with an "@"? alternative is "mention"

# Usernames are only \w+.  The look-behind stops "user@example.com" from
# being read as a mention of @example.  Hash and at are meant to be treated
# differently, so '#' is left alone here.
my $mention = qr/(?<!\w)\@(\w+)/;

# Short-URL pattern.  Built with qr{} so that \w and \. stay regex escapes
# (inside a double-quoted string they collapse to plain "w" and "."), and
# with the {2,3} quantifier spelled with a comma ({2-3} matches literally).
my $url = qr{https?://\w+\.\w{2,3}/\w+};

#look for retweet
if ($tweet =~ m/\bRT\s+\@(\w+)/) {
    $retweet    = true;
    $originator = $1;
    say length($originator);
    say "originator is " . $originator; say "\n";
    # Remove the marker together with the ":" and whitespace that usually
    # follow it, so a reply target right behind it ("RT @a: @b ...") ends up
    # at the very start of the text.  The \b keeps a short name from eating
    # the prefix of a longer one.
    $tweet =~ s/\bRT\s+\@\Q$originator\E\b:?\s*//g;
}
say length($tweet);
say "tweet without originator is " . $tweet; say "\n";

#pull out initial @
if ($tweet =~ s/\A\@(\w+)\s*//) {
    push @names, $1;
    $response = true;    #starts with "@"
}
say length($tweet);
say "tweet without header \@ is " . $tweet; say "\n";

#any links used
# Each pass removes the leftmost remaining link, so the loop ends once the
# text holds no more URLs.
while ($tweet =~ s/($url)//) {
    push @links, $1;
}
say length($tweet);
say "tweet without links is " . $tweet; say "\n";
say @links;
say "links are........";
foreach (@links) { say $_; }

#any hashtags used
# TODO: not implemented; @hashtags stays empty.

#pull out @'s and names from anywhere
# Works wherever the mention sits -- after a hashtag, at the end of the
# text, next to punctuation -- and drops one following space with it.
while ($tweet =~ s/${mention}\s?//) {
    push @names, $1;
}
say length($tweet);
say "tweet without any \@'s is " . $tweet; say "\n";
say scalar @names;    #number of names; originator is not counted in @names
say "names are ....";
foreach (@names) { say $_; }

#any emoticons used
# TODO: not implemented.

#rest of text
# TODO: not implemented; $restoftweet stays empty.

say "Salutations distinguées.";

# Notes:
# Yuna, another choice is to pull out "RT @" and collect the name after. I
# didn't do that so we're treating "RT" as a word / feature. May be useful
# that way, I think.
# Another "feature" you could pull out: were they themselves mentioned in a
# retweet?
#
# Candidate features:
#   starts with @ or RT @                  yes/no
#   has @ or RT @ in it anywhere           yes/no
#   number of times RT, number of times @
#   names mentioned in @                   /@(\w.+) /g
#       number of names in @names
#       is self one of the names  (probably won't be used more than once ...
#                                  and wouldn't be relevant if it were)
#   does the text "RT" occur?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment