isomorphisms/tweetstrip.perl

## tweetstrip.perl
#!/usr/bin/perl
use warnings;	use strict;
use boolean;
use feature 'say';


# Sample tweets for manual testing; only one assignment is live at a time.
#my $tweet = q{RT @peteyorn: @Starbucks Thanks for putting the Break Up album for sale in your stores. \\ It's a great album! Nice work!};
#my $tweet = q{@kohlgreyson Yesterday's announcement was just a preview. We will update the twitter icon in the spring.};
my $tweet = q{#StarbucksVIA is at Moonshine for the @Wired party for #SXSW. Two of our amazing designers are enjoying a VIAtini. http://yfrog.com/j59i1j};


# Features pulled out of the tweet. All lists start empty.
my (@names, @hashtags, @links);

# $originator holds the account named after "RT @".
my ($restoftweet, $originator) = ('', '');

# Flag: does the tweet begin with an "@"? The alternative is a "mention".
my $response = 0;


# Character class for the main text of a tweet: anything that is not an
# "@" or a "#". User names themselves are only \w+; "#" and "@" are meant
# to be treated differently from ordinary text.
# Earlier attempt, kept for reference:
#my $w = "[:word:|:punct:]";
my $w = '[^@#]';


# Look for a retweet marker ("RT @name") and record who was retweeted.
# The marker is removed from $tweet together with any punctuation and
# whitespace directly after it (e.g. the ": " in "RT @peteyorn: @Starbucks ..."),
# so a tweet that continues with "@name" now starts with it.
# "@" and "#" are not counted as punctuation here, so a mention or hashtag
# that follows the marker stays intact. \b keeps "RT" from matching at the
# end of a longer word.
if ($tweet =~ s/\bRT \@(\w+)[^\w\s\@#]*\s*//) {
	$originator = $1;
	say length($originator);
	say "originator is " . $originator; say "\n";
}
say length($tweet);
say "tweet without originator is " . $tweet; say "\n";


# Pull out a leading "@name".
# When the tweet starts with "@name " followed by more text on the same line,
# the name is appended to @names, the tweet is cut down to the text after it,
# and $response is set to flag the tweet as starting with "@".
if (my ($handle, $remainder) = $tweet =~ m/^@(\w+) (.+)$/) {
	$tweet = $remainder;
	push @names, $handle;
	$response = true;
}
say for length($tweet), "tweet without header \@ is " . $tweet, "\n";


# Any links used.
# Pattern for a short URL: http or https, a host of the form name.tld with a
# 2-3 character tld, and a single path segment (e.g. http://yfrog.com/j59i1j).
# It is built with qr// so that \w and \. reach the regex engine unchanged
# (a double-quoted string would drop the backslashes), and the range
# quantifier is written {2,3}.
my $url = qr{https?://\w+\.\w{2,3}/\w+};

# Remove each link from the tweet text, left to right, and remember it.
while ($tweet =~ s/($url)//) {
	push @links, $1;
}


say length($tweet);
say "tweet without links is " . $tweet; say "\n";


say @links;	# all links run together on one line
say "links are........";
foreach (@links) { say $_; }


#any hashtags used


# Pull out every remaining "@name" from anywhere in the tweet.
# Each mention is removed (with one trailing space, if present) and its name
# is appended to @names. This also finds mentions that come after a hashtag,
# sit at the very end of the tweet, or are followed by punctuation.
# The lookbehind skips an "@" that directly follows a word character, so
# text such as "user@example.com" is left alone.
while ($tweet =~ s/(?<!\w)\@(\w+) ?//) {
	push @names, $1;
}

say length($tweet);
say "tweet without any \@'s is " . $tweet; say "\n";

say scalar(@names);	# number of names; the originator is not counted in @names
say "names are ....";
foreach (@names) { say $_; }


#any emoticons used


#rest of text


say "Salutations distinguées.";


#Yuna, another choice is to pull out "RT @" and collect the name after. I didn't do that so we're treating "RT" as a word / feature. May be useful that way, I think.
#Another "feature" you could pull out: were they themselves mentioned in a retweet?


#starts with @ or RT @
#yes/no


#has @ or RT @ in it anywhere
#yes/no
#number of times RT, number of times @


#names mentioned in @
# /@(\w.+) /g


# number of names in @names
# is self one of the names	#probably won't be used more than once ... and wouldn't be relevant if it were


#does the text "RT" occur?
	#!/usr/bin/perl
	use warnings; use strict;
	use boolean;
	use feature 'say';


	# Sample tweets for manual testing; only one assignment is live at a time.
	#my $tweet = q{RT @peteyorn: @Starbucks Thanks for putting the Break Up album for sale in your stores. \\ It's a great album! Nice work!};
	#my $tweet = q{@kohlgreyson Yesterday's announcement was just a preview. We will update the twitter icon in the spring.};
	my $tweet = q{#StarbucksVIA is at Moonshine for the @Wired party for #SXSW. Two of our amazing designers are enjoying a VIAtini. http://yfrog.com/j59i1j};



	# Features pulled out of the tweet. All lists start empty.
	my (@names, @hashtags, @links);

	# $originator holds the account named after "RT @".
	my ($restoftweet, $originator) = ('', '');

	# Flag: does the tweet begin with an "@"? The alternative is a "mention".
	my $response = 0;


	# Character class for the main text of a tweet: anything that is not an
	# "@" or a "#". User names themselves are only \w+; "#" and "@" are meant
	# to be treated differently from ordinary text.
	# Earlier attempt, kept for reference:
	#my $w = "[:word:\|:punct:]";
	my $w = '[^@#]';









	# Look for a retweet marker ("RT @name") and record who was retweeted.
	# The marker is removed from $tweet together with any punctuation and
	# whitespace directly after it (e.g. the ": " in "RT @peteyorn: @Starbucks ..."),
	# so a tweet that continues with "@name" now starts with it.
	# "@" and "#" are not counted as punctuation here, so a mention or hashtag
	# that follows the marker stays intact. \b keeps "RT" from matching at the
	# end of a longer word.
	if ($tweet =~ s/\bRT \@(\w+)[^\w\s\@#]*\s*//) {
		$originator = $1;
		say length($originator);
		say "originator is " . $originator; say "\n";
	}
	say length($tweet);
	say "tweet without originator is " . $tweet; say "\n";



	# Pull out a leading "@name".
	# When the tweet starts with "@name " followed by more text on the same line,
	# the name is appended to @names, the tweet is cut down to the text after it,
	# and $response is set to flag the tweet as starting with "@".
	if (my ($handle, $remainder) = $tweet =~ m/^@(\w+) (.+)$/) {
		$tweet = $remainder;
		push @names, $handle;
		$response = true;
	}
	say for length($tweet), "tweet without header \@ is " . $tweet, "\n";






	# Any links used.
	# Pattern for a short URL: http or https, a host of the form name.tld with a
	# 2-3 character tld, and a single path segment (e.g. http://yfrog.com/j59i1j).
	# It is built with qr// so that \w and \. reach the regex engine unchanged
	# (a double-quoted string would drop the backslashes), and the range
	# quantifier is written {2,3}.
	my $url = qr{https?://\w+\.\w{2,3}/\w+};

	# Remove each link from the tweet text, left to right, and remember it.
	# The substitution works wherever the link sits in the text; it does not
	# depend on how much text comes before or after it.
	while ($tweet =~ s/($url)//) {
		push @links, $1;
	}


	say length($tweet);
	say "tweet without links is " . $tweet; say "\n";


	say @links;	# all links run together on one line
	say "links are........";
	foreach (@links) { say $_; }



	#any hashtags used






	# Pull out every remaining "@name" from anywhere in the tweet.
	# Each mention is removed (with one trailing space, if present) and its name
	# is appended to @names. This also finds mentions that come after a hashtag,
	# sit at the very end of the tweet, or are followed by punctuation.
	# The lookbehind skips an "@" that directly follows a word character, so
	# text such as "user@example.com" is left alone.
	while ($tweet =~ s/(?<!\w)\@(\w+) ?//) {
		push @names, $1;
	}

	say length($tweet);
	say "tweet without any \@'s is " . $tweet; say "\n";

	say scalar(@names);	# number of names; the originator is not counted in @names
	say "names are ....";
	foreach (@names) { say $_; }



	#any emoticons used



	#rest of text



	say "Salutations distinguées.";





	#Yuna, another choice is to pull out "RT @" and collect the name after. I didn't do that so we're treating "RT" as a word / feature. May be useful that way, I think.
	#Another "feature" you could pull out: were they themselves mentioned in a retweet?



	#starts with @ or RT @
	#yes/no


	#has @ or RT @ in it anywhere
	#yes/no
	#number of times RT, number of times @


	#names mentioned in @
	# /@(\w.+) /g



	# number of names in @names
	# is self one of the names #probably won't be used more than once ... and wouldn't be relevant if it were


	#does the text "RT" occur?