@leomelzer
Created July 9, 2012 09:02
Simple-stupid Sentiment analysis for 1 million tweets.
package analyse;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;

/*
 * (Really simple-dumb) sentiment analysis for a Lucene index of 1 million tweets!
 * Based on http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/
 */
public class Analyse {

    // path to the Lucene index
    private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/";

    // path to the language profiles for the classifier
    private static String langProfileDirectory = "./src/profiles/";

    // Lucene queryParser for saving
    private static QueryParser queryParser;

    // used to store positive and negative words for scoring
    static List<String> posWords = new ArrayList<String>();
    static List<String> negWords = new ArrayList<String>();

    // keep some stats! [-1 / 0 / 1 / not english / foursquare / no text to classify]
    static int[] stats = new int[6];

    /**
     * @param args
     * @throws IOException
     * @throws LangDetectException
     */
    public static void main(String[] args) throws IOException, LangDetectException {
        // huh, how long?
        long startTime = System.currentTimeMillis();

        // open the Lucene index
        Directory dir;
        IndexReader docReader = null;
        try {
            dir = FSDirectory.open(new File(indexPath));
            docReader = IndexReader.open(dir, true); // read-only
        } catch (IOException e1) {
            e1.printStackTrace();
            // without an index there is nothing to analyse
            return;
        }

        System.out.println("START: reading file list");

        // source: www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
        BufferedReader negReader = new BufferedReader(new FileReader(new File("./src/negative-words.txt")));
        BufferedReader posReader = new BufferedReader(new FileReader(new File("./src/positive-words.txt")));

        // currently read word
        String word;
        // add words to the comparison lists
        while ((word = negReader.readLine()) != null) {
            negWords.add(word);
        }
        while ((word = posReader.readLine()) != null) {
            posWords.add(word);
        }

        // cleanup
        negReader.close();
        posReader.close();

        System.out.println("FINISH: reading file list");
        // ----------------------------------------------
        System.out.println("START: calculating sentiment");

        // prepare the language classifier
        DetectorFactory.loadProfile(langProfileDirectory);

        // store the different languages
        Map<String, Integer> langHitList = new HashMap<String, Integer>();

        // detects the language, using http://code.google.com/p/language-detection/
        // which has 99% accuracy
        Detector detector;
        // current tweet
        Document tweet;
        // current score
        int score = 0;
        // current text
        String text;
        // maximum number of documents
        int max = docReader.maxDoc();
        // used to give some feedback while processing the 1 million tweets
        int j = 0;
        // do we want to skip saving that document?
        boolean skipSave = false;

        for (int i = 0; i < max; i++) {
            if (i % 100000 == 0) {
                System.out.println("PROCESSING: " + j * 100000 + " of " + max + " tweets processed...");
                j++;
            }
            // reset; most of the time we want that.
            skipSave = false;
            try {
                // read it!
                tweet = docReader.document(i);
                text = tweet.get("text");

                // unfortunately we need a new instance every time...
                detector = DetectorFactory.create();
                detector.append(text);
                // classify the language!
                String detectedLanguage = detector.detect();

                // if it is not English...
                if (detectedLanguage.equals("en") == false) {
                    stats[3]++;
                    // we can't classify non-English tweets, so just keep them neutral
                    score = 0;
                } else if (text.startsWith("I'm at")
                        || text.startsWith("I just became the mayor")
                        || text.startsWith("I just ousted")) {
                    // all your foursquare updates are belong to us.
                    stats[4]++;
                    // and we don't save them. yo.
                    skipSave = true;
                } else {
                    // finally! retrieve the sentiment score.
                    score = getSentimentScore(tweet.get("text"));
                    // shift the index by one so we won't have -1 and stuff...
                    stats[score + 1]++;
                    // wanna see what neutral tweets look like? uncomment.
                    // if (score == 0) {
                    //     System.out.println("Score: " + score + " for Tweet (" + tweet.get("ID") + "): " + tweet.get("text"));
                    // }
                }

                // so now for the saving...
                if (skipSave == false) {
                    Integer currentCount = langHitList.get(detectedLanguage);
                    // ...save the detected language for some stats
                    langHitList.put(detectedLanguage, (currentCount == null) ? 1 : currentCount + 1);
                    // tweet.set("language", detectedLanguage);
                    // tweet.set("sentiment", score);
                    // tweet.get("ID");
                }
            } catch (LangDetectException e) {
                // thrown by the language classifier when tweets consist only of things like :D or :3 or ?????????
                // count how many times there is no valid input; we also won't save it, as we're in the catch clause...
                stats[5]++;
            } catch (Exception e) {
                // something went wrong, ouuups!
                e.printStackTrace();
                System.err.println("Doc at " + i + " does not exist");
            }
        }

        System.out.println("FINISH: calculating sentiment");
        // ----------------------------------------------
        long endTime = System.currentTimeMillis();
        long totalTime = endTime - startTime;
        System.out.println("----------------------------------------------");
        System.out.println("STATS - TIME: Analysis took "
                + TimeUnit.SECONDS.convert(totalTime, TimeUnit.MILLISECONDS) + " seconds");
        // ----------------------------------------------
        // get me some info!
        System.out.println("STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify]");
        System.out.println("STATS - COUNTS: " + java.util.Arrays.toString(stats));
        System.out.println("STATS - LANGUAGE: " + langHitList.toString());

        // cleanup
        docReader.close();
    }

    /**
     * Does some string mangling, counts occurrences of the words in the
     * positive / negative word lists and finally takes the delta.
     *
     * @param input String: the text to classify
     * @return score int: -1 if the delta is < 0, 1 if it is > 0, otherwise 0 -
     *         we don't care about the actual size of the delta
     */
    private static int getSentimentScore(String input) {
        // normalize!
        input = input.toLowerCase();
        input = input.trim();
        // remove all non-alphanumeric, non-whitespace chars
        input = input.replaceAll("[^a-zA-Z0-9\\s]", "");

        int negCounter = 0;
        int posCounter = 0;

        // so what have we got?
        String[] words = input.split(" ");
        // check whether the current word appears in our reference lists...
        for (int i = 0; i < words.length; i++) {
            if (posWords.contains(words[i])) {
                posCounter++;
            }
            if (negWords.contains(words[i])) {
                negCounter++;
            }
        }

        // positive matches MINUS negative matches
        int result = posCounter - negCounter;
        // negative?
        if (result < 0) {
            return -1;
        // or positive?
        } else if (result > 0) {
            return 1;
        }
        // neutral to the rescue!
        return 0;
    }
}
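
The saving itself is only stubbed out in the tweet.set(...) comments above, and Lucene's Document has no set() method. With the Lucene 3.x API used here, writing the detected language and score back would mean replacing the stored fields and updating the document through an IndexWriter. A rough sketch, assuming the index stores a unique, un-analyzed "ID" field per tweet (the field the commented-out code reads):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;

class TweetSaver {
    static void save(IndexWriter writer, Document tweet, String detectedLanguage, int score)
            throws Exception {
        // drop stale values, then add the fresh fields
        tweet.removeFields("language");
        tweet.removeFields("sentiment");
        tweet.add(new Field("language", detectedLanguage, Field.Store.YES, Field.Index.NOT_ANALYZED));
        tweet.add(new Field("sentiment", Integer.toString(score), Field.Store.YES, Field.Index.NOT_ANALYZED));
        // replaces the existing stored document whose ID matches
        writer.updateDocument(new Term("ID", tweet.get("ID")), tweet);
    }
}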
START: reading file list
FINISH: reading file list
START: calculating sentiment
PROCESSING: 0 of 1057001 tweets processed...
PROCESSING: 100000 of 1057001 tweets processed...
PROCESSING: 200000 of 1057001 tweets processed...
PROCESSING: 300000 of 1057001 tweets processed...
PROCESSING: 400000 of 1057001 tweets processed...
PROCESSING: 500000 of 1057001 tweets processed...
PROCESSING: 600000 of 1057001 tweets processed...
PROCESSING: 700000 of 1057001 tweets processed...
PROCESSING: 800000 of 1057001 tweets processed...
PROCESSING: 900000 of 1057001 tweets processed...
PROCESSING: 1000000 of 1057001 tweets processed...
FINISH: calculating sentiment
----------------------------------------------
STATS - TIME: Analysis took 569 seconds
STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify]
STATS - COUNTS: [89309, 248062, 130849, 560431, 23063, 5287]
STATS - LANGUAGE: {tl=12767, tr=14695, no=8690, th=3268, bn=671, fi=10503, ta=5, sv=6037, fr=19364, bg=454, sw=4527, sl=5516, sk=2467, da=5461, so=24462, sq=1955, ko=3151, he=92, cs=1186, kn=3, pa=1, pl=4483, ru=4920, hr=3802, ro=5077, vi=2981, hu=3411, lv=1713, lt=3106, hi=4, id=34223, de=17254, zh-tw=476, mk=238, uk=160, it=23536, zh-cn=761, ur=70, fa=297, ar=6145, el=1071, ne=2, pt=188253, en=468220, et=12870, es=84303, ja=9758, nl=14863, af=11379}
@jay-pk-codebook

Hi, what is the format of your input tweets? How have you denoted your documents inside "/Users/leomelzer/Downloads/Tweets/"?
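
For reference, the code above only expects a Lucene index whose documents store a "text" field (and an "ID" field, judging by the commented-out lines). An index like that could be built roughly as follows, using the same Lucene 3.x API as the gist; the analyzer choice and the sample tweet are illustrative assumptions, not the author's actual indexer:

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class BuildIndex {
    public static void main(String[] args) throws Exception {
        // target directory, matching indexPath in Analyse
        FSDirectory dir = FSDirectory.open(new File("/Users/leomelzer/Downloads/Tweets/"));
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(Version.LUCENE_30),
                true, IndexWriter.MaxFieldLength.UNLIMITED);

        // one Lucene Document per tweet, with the two fields Analyse reads
        Document doc = new Document();
        doc.add(new Field("ID", "123456", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("text", "just a happy little tweet", Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);

        writer.close();
    }
}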

@rks0191

rks0191 commented Dec 12, 2013

This may not work well for all tweets. Suppose I have something like "the product is too good to be ignored", your ./src/negative-words.txt file contains the word "ignored", and ./src/positive-words.txt contains "good". In this case the result will be zero, and hence the statement will be deemed neutral, when actually the review is truly positive.
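
The cancellation is easy to reproduce with the same normalization and counting as getSentimentScore above. A minimal, self-contained sketch, assuming (as the comment does) that "good" is in positive-words.txt and "ignored" is in negative-words.txt:

import java.util.Arrays;
import java.util.List;

public class CancellationDemo {
    public static void main(String[] args) {
        // two-word stand-ins for the full Liu word lists
        List<String> posWords = Arrays.asList("good");
        List<String> negWords = Arrays.asList("ignored");

        // same mangling as getSentimentScore
        String input = "The product is too good to be ignored!"
                .toLowerCase().trim().replaceAll("[^a-zA-Z0-9\\s]", "");

        int pos = 0, neg = 0;
        for (String w : input.split(" ")) {
            if (posWords.contains(w)) pos++; // matches "good"
            if (negWords.contains(w)) neg++; // matches "ignored"
        }
        // 1 - 1 = 0, so the clearly positive review is scored neutral
        System.out.println("delta = " + (pos - neg));
    }
}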

@ameyjadiye

@rks0191, well, that's why the program's header says "Simple-stupid Sentiment analysis" 😉

@vnkmr7620

What is private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/";?
What does this path contain? I tried, but I am getting the following error:

START: reading file list
org.apache.lucene.index.IndexNotFoundException: no segments* file found in org.apache.lucene.store.SimpleFSDirectory@C:\vinay\apache-lucene\org\apache\lucene\index lockFactory=org.apache.lucene.store.NativeFSLockFactory@ed1f14: files: [BufferedDeletes$Num.class, BufferedDeletes.class, ByteBlockPool$Allocator.class, ByteBlockPool.class, ByteSliceReader.class, ByteSliceWriter.class, CharBlockPool.class, CheckIndex$MySegmentTermDocs.class, CheckIndex$Status$SegmentInfoStatus.class, CheckIndex$Status.class, CheckIndex.class, CompoundFileReader$1.class, CompoundFileReader$CSIndexInput.class, CompoundFileReader$FileEntry.class, CompoundFileReader.class, CompoundFileWriter$1.class, CompoundFileWriter$FileEntry.class, CompoundFileWriter.class, ConcurrentMergeScheduler$MergeThread.class, ConcurrentMergeScheduler.class, CorruptIndexException.class, DefaultSkipListReader.class, DefaultSkipListWriter.class, DirectoryIndexReader$1.class, DirectoryIndexReader$2.class, DirectoryIndexReader$ReaderCommit.class, DirectoryIndexReader.class, DocConsumer.class, DocConsumerPerThread.class, DocFieldConsumer.class, DocFieldConsumerPerField.class, DocFieldConsumerPerThread.class, DocFieldConsumers$PerDoc.class, DocFieldConsumers.class, DocFieldConsumersPerField.class, DocFieldConsumersPerThread.class, DocFieldProcessor.class, DocFieldProcessorPerField.class, DocFieldProcessorPerThread.class, DocInverter$FieldInvertState.class, DocInverter.class, DocInverterPerField.class, DocInverterPerThread.class, DocumentsWriter$1.class, DocumentsWriter$ByteBlockAllocator.class, DocumentsWriter$DocState.class, DocumentsWriter$DocWriter.class, DocumentsWriter$FlushState.class, DocumentsWriter$SkipDocWriter.class, DocumentsWriter$WaitQueue.class, DocumentsWriter.class, DocumentsWriterThreadState.class, FieldInfo.class, FieldInfos.class, FieldReaderException.class, FieldSortedTermVectorMapper.class, FieldsReader$FieldForMerge.class, FieldsReader$LazyField.class, FieldsReader.class, FieldsWriter.class, FilterIndexReader$FilterTermDocs.class, FilterIndexReader$FilterTermEnum.class, FilterIndexReader$FilterTermPositions.class, FilterIndexReader.class, FreqProxFieldMergeState.class, FreqProxTermsWriter$PostingList.class, FreqProxTermsWriter.class, FreqProxTermsWriterPerField.class, FreqProxTermsWriterPerThread.class, IndexCommit.class, IndexCommitPoint.class, IndexDeletionPolicy.class, IndexFileDeleter$1.class, IndexFileDeleter$CommitPoint.class, IndexFileDeleter$RefCount.class, IndexFileDeleter.class, IndexFileNameFilter.class, IndexFileNames.class, IndexModifier.class, IndexReader$1.class, IndexReader$2.class, IndexReader$FieldOption.class, IndexReader.class, IndexWriter$MaxFieldLength.class, IndexWriter.class, IntBlockPool.class, InvertedDocConsumer.class, InvertedDocConsumerPerField.class, InvertedDocConsumerPerThread.class, InvertedDocEndConsumer.class, InvertedDocEndConsumerPerField.class, InvertedDocEndConsumerPerThread.class, KeepOnlyLastCommitDeletionPolicy.class, LogByteSizeMergePolicy.class, LogDocMergePolicy.class, LogMergePolicy.class, MergeDocIDRemapper.class, MergePolicy$MergeAbortedException.class, MergePolicy$MergeException.class, MergePolicy$MergeSpecification.class, MergePolicy$OneMerge.class, MergePolicy.class, MergeScheduler.class, MultiLevelSkipListReader$SkipBuffer.class, MultiLevelSkipListReader.class, MultiLevelSkipListWriter.class, MultipleTermPositions$1.class, MultipleTermPositions$IntQueue.class, MultipleTermPositions$TermPositionsQueue.class, MultipleTermPositions.class, MultiReader.class, 
MultiSegmentReader$MultiTermDocs.class, MultiSegmentReader$MultiTermEnum.class, MultiSegmentReader$MultiTermPositions.class, MultiSegmentReader.class, NormsWriter.class, NormsWriterPerField.class, NormsWriterPerThread.class, ParallelArrayTermVectorMapper.class, ParallelReader$ParallelTermDocs.class, ParallelReader$ParallelTermEnum.class, ParallelReader$ParallelTermPositions.class, ParallelReader.class, Payload.class, PositionBasedTermVectorMapper$TVPositionInfo.class, PositionBasedTermVectorMapper.class, RawPostingList.class, ReadOnlyMultiSegmentReader.class, ReadOnlySegmentReader.class, ReusableStringReader.class, SegmentInfo.class, SegmentInfos$1.class, SegmentInfos$2.class, SegmentInfos$FindSegmentsFile.class, SegmentInfos.class, SegmentMergeInfo.class, SegmentMergeQueue.class, SegmentMerger$1.class, SegmentMerger$CheckAbort.class, SegmentMerger.class, SegmentReader$Norm.class, SegmentReader.class, SegmentTermDocs.class, SegmentTermEnum.class, SegmentTermPositions.class, SegmentTermPositionVector.class, SegmentTermVector.class, SerialMergeScheduler.class, SnapshotDeletionPolicy$MyCommitPoint.class, SnapshotDeletionPolicy.class, SortedTermVectorMapper.class, StaleReaderException.class, StoredFieldsWriter$PerDoc.class, StoredFieldsWriter.class, StoredFieldsWriterPerField.class, StoredFieldsWriterPerThread.class, Term.class, TermBuffer.class, TermDocs.class, TermEnum.class, TermFreqVector.class, TermInfo.class, TermInfosReader$1.class, TermInfosReader$ThreadResources.class, TermInfosReader.class, TermInfosWriter.class, TermPositions.class, TermPositionVector.class, TermsHash.class, TermsHashConsumer.class, TermsHashConsumerPerField.class, TermsHashConsumerPerThread.class, TermsHashPerField.class, TermsHashPerThread.class, TermVectorEntry.class, TermVectorEntryFreqSortedComparator.class, TermVectorMapper.class, TermVectorOffsetInfo.class, TermVectorsReader.class, TermVectorsTermsWriter$PerDoc.class, TermVectorsTermsWriter$PostingList.class, TermVectorsTermsWriter.class, TermVectorsTermsWriterPerField.class, TermVectorsTermsWriterPerThread.class, TermVectorsWriter.class]
at org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:741)
at org.apache.lucene.index.StandardDirectoryReader.open(StandardDirectoryReader.java:52)
at org.apache.lucene.index.DirectoryReader.open(DirectoryReader.java:65)
at org.apache.lucene.index.IndexReader.open(IndexReader.java:291)
at vinay.Analyse.main(Analyse.java:57)
FINISH: reading file list
START: calculating sentiment
Exception in thread "main" java.lang.NoClassDefFoundError: net/arnx/jsonic/JSONException
at vinay.Analyse.main(Analyse.java:91)
Caused by: java.lang.ClassNotFoundException: net.arnx.jsonic.JSONException
at java.net.URLClassLoader$1.run(Unknown Source)
at java.net.URLClassLoader$1.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
... 1 more

@sudhir-govekar

Hello sir,
I am new to sentiment analysis. Can anyone help me out with the positive and negative word lists?
I need to detect sarcasm vs. non-sarcasm in Twitter data.

Thank you in advance...

@BuildSuccessful

Can you please attach the sample files being used in the code?

@Bhagyasree1234

I am very new to this sentiment analysis. So, can you please share the input file? It would be very helpful to me. Thanks in advance.
