@leomelzer
Created July 9, 2012 09:02
Simple-stupid sentiment analysis for 1 million tweets.

package analyse;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
/*
 * (Really simple-dumb) sentiment analysis for a Lucene index of 1 million tweets!
 * Based on http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/
 */
public class Analyse {
    // path to lucene index
    private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/";
    // path to language profiles for classifier
    private static String langProfileDirectory = "./src/profiles/";
    // lucene queryParser for saving
    private static QueryParser queryParser;
    // used to store positive and negative words for scoring
    static List<String> posWords = new ArrayList<String>();
    static List<String> negWords = new ArrayList<String>();
    // keep some stats! [-1 / 0 / 1 / not english / foursquare / no text to classify]
    static int[] stats = new int[6];
    /**
     * @param args
     * @throws IOException
     * @throws LangDetectException
     */
    public static void main(String[] args) throws IOException,
            LangDetectException {
        // huh, how long?
        long startTime = System.currentTimeMillis();
        // open lucene index
        Directory dir;
        IndexReader docReader = null;
        try {
            dir = FSDirectory.open(new File(indexPath));
            docReader = IndexReader.open(dir, true);
        } catch (IOException e1) {
            e1.printStackTrace();
            // no index, nothing to analyse - bail out instead of NPE'ing below
            return;
        }
System.out.println("START: reading file list");
// source: www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
BufferedReader negReader = new BufferedReader(new FileReader(new File(
"./src/negative-words.txt")));
BufferedReader posReader = new BufferedReader(new FileReader(new File(
"./src/positive-words.txt")));
// currently read word
String word;
// add words to comparison list
while ((word = negReader.readLine()) != null) {
negWords.add(word);
}
while ((word = posReader.readLine()) != null) {
posWords.add(word);
}
// cleanup
negReader.close();
posReader.close();
System.out.println("FINISH: reading file list");
        // ----------------------------------------------
        System.out.println("START: calculating sentiment");
        // prepare language classifier
        DetectorFactory.loadProfile(langProfileDirectory);
        // store different languages
        Map<String, Integer> langHitList = new HashMap<String, Integer>();
        // detect language, using http://code.google.com/p/language-detection/
        // which claims 99% accuracy
        Detector detector;
        // current tweet
        Document tweet;
        // current score
        int score = 0;
        // current text
        String text;
        // maximum number of documents
        int max = docReader.maxDoc();
        // used to give some feedback while processing the 1 million tweets
        int j = 0;
        // do we want to skip saving this document?
        boolean skipSave = false;
        for (int i = 0; i < max; i++) {
            if (i % 100000 == 0) {
                System.out.println("PROCESSING: " + j * 100000 + " of " + max
                        + " tweets processed...");
                j++;
            }
            // reset, most of the time we want that.
            skipSave = false;
            try {
                // read it!
                tweet = docReader.document(i);
                text = tweet.get("text");
                // we need a new instance every time, unfortunately...
                detector = DetectorFactory.create();
                detector.append(text);
                // classify language!
                String detectedLanguage = detector.detect();
                // if it is not english...
                if (!detectedLanguage.equals("en")) {
                    stats[3]++;
                    // we can't classify non-english tweets, so just keep them neutral
                    score = 0;
                } else if (text.startsWith("I'm at")
                        || text.startsWith("I just became the mayor")
                        || text.startsWith("I just ousted")) {
                    // all your foursquare updates are belong to us.
                    stats[4]++;
                    // and we don't save them. yo.
                    skipSave = true;
                } else {
                    // finally! retrieve sentiment score.
                    score = getSentimentScore(tweet.get("text"));
                    // shift the index by one so we won't have -1 and stuff...
                    stats[score + 1]++;
                    // wanna see what neutral tweets look like? uncomment.
                    // if (score == 0) {
                    //     System.out.println("Score: " + score + " for Tweet ("
                    //             + tweet.get("ID") + "): " + tweet.get("text"));
                    // }
                }
                // so now for the saving...
                if (!skipSave) {
                    Integer currentCount = langHitList.get(detectedLanguage);
                    // ...save the detected language for some stats
                    langHitList.put(detectedLanguage,
                            (currentCount == null) ? 1 : currentCount + 1);
                    // tweet.set("language", detectedLanguage)
                    // tweet.set("sentiment", score);
                    // tweet.get("ID");
                }
            } catch (LangDetectException e) {
                // thrown by the language classifier when tweets are like :D or
                // :3 or ?????????
                // count how many times there is no valid input; we won't save
                // it either, as we're in the catch clause...
                stats[5]++;
            } catch (Exception e) {
                // something went wrong, ouuups!
                e.printStackTrace();
                System.err.println("Doc at " + i + " does not exist");
            }
        }
System.out.println("FINISH: calculating sentiment");
// ----------------------------------------------
long endTime = System.currentTimeMillis();
long totalTime = endTime - startTime;
System.out.println("----------------------------------------------");
System.out.println("STATS - TIME: Analysis took "
+ TimeUnit.SECONDS.convert(totalTime, TimeUnit.MILLISECONDS)
+ " seconds");
// ----------------------------------------------
// get me some info!
System.out.println("STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify]");
System.out.println("STATS - COUNTS: " + java.util.Arrays.toString(stats));
System.out.println("STATS - LANGUAGE: " + langHitList.toString());
// cleanup
docReader.close();
}
    /**
     * Does some string mangling, counts occurrences of the words in the
     * positive / negative word lists and finally takes the delta.
     *
     * @param input
     *            String: the text to classify
     * @return int: -1 if the delta is < 0, 1 if it is > 0, otherwise 0 - we
     *         don't care about the actual size of the delta
     */
    private static int getSentimentScore(String input) {
        // normalize!
        input = input.toLowerCase();
        input = input.trim();
        // remove everything that is neither alphanumeric nor whitespace
        input = input.replaceAll("[^a-zA-Z0-9\\s]", "");
        int negCounter = 0;
        int posCounter = 0;
        // so what have we got?
        String[] words = input.split(" ");
        // check if the current word appears in our reference lists...
        for (String w : words) {
            if (posWords.contains(w)) {
                posCounter++;
            }
            if (negWords.contains(w)) {
                negCounter++;
            }
        }
        // positive matches MINUS negative matches
        int result = posCounter - negCounter;
        // negative?
        if (result < 0) {
            return -1;
        // or positive?
        } else if (result > 0) {
            return 1;
        }
        // neutral to the rescue!
        return 0;
    }
}
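
A quick sanity check of the scoring logic, which one could drop at the end of main() once the word lists are loaded (this assumes "good" sits in positive-words.txt and "bad" / "terrible" in negative-words.txt, which holds for the Liu lexicon linked above):

// hypothetical smoke test for getSentimentScore()
System.out.println(getSentimentScore("Good food, good mood!"));      // 1: two positive hits, none negative
System.out.println(getSentimentScore("bad service, terrible wifi")); // -1: two negative hits
System.out.println(getSentimentScore("good idea, bad timing"));      // 0: one of each, the delta cancels out

Output of a full run over the index: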
START: reading file list
FINISH: reading file list
START: calculating sentiment
PROCESSING: 0 of 1057001 tweets processed...
PROCESSING: 100000 of 1057001 tweets processed...
PROCESSING: 200000 of 1057001 tweets processed...
PROCESSING: 300000 of 1057001 tweets processed...
PROCESSING: 400000 of 1057001 tweets processed...
PROCESSING: 500000 of 1057001 tweets processed...
PROCESSING: 600000 of 1057001 tweets processed...
PROCESSING: 700000 of 1057001 tweets processed...
PROCESSING: 800000 of 1057001 tweets processed...
PROCESSING: 900000 of 1057001 tweets processed...
PROCESSING: 1000000 of 1057001 tweets processed...
FINISH: calculating sentiment
----------------------------------------------
STATS - TIME: Analysis took 569 seconds
STATS - COUNTS: [negative | neutral | positive | not english | foursquare | no text to classify]
STATS - COUNTS: [89309, 248062, 130849, 560431, 23063, 5287]
STATS - LANGUAGE: {tl=12767, tr=14695, no=8690, th=3268, bn=671, fi=10503, ta=5, sv=6037, fr=19364, bg=454, sw=4527, sl=5516, sk=2467, da=5461, so=24462, sq=1955, ko=3151, he=92, cs=1186, kn=3, pa=1, pl=4483, ru=4920, hr=3802, ro=5077, vi=2981, hu=3411, lv=1713, lt=3106, hi=4, id=34223, de=17254, zh-tw=476, mk=238, uk=160, it=23536, zh-cn=761, ur=70, fa=297, ar=6145, el=1071, ne=2, pt=188253, en=468220, et=12870, es=84303, ja=9758, nl=14863, af=11379}
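
A note on the runtime: a large share of those 569 seconds is presumably spent in List.contains(), which performs a linear scan over a few thousand lexicon entries for every token of every tweet. A hedged tweak: since the code only ever calls add() and contains() on the word stores, swapping the ArrayLists for HashSets gives O(1) lookups without touching anything else.

import java.util.HashSet;
import java.util.Set;

// drop-in replacement for the two word stores in Analyse
static Set<String> posWords = new HashSet<String>();
static Set<String> negWords = new HashSet<String>();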
@vnkmr7620

What is private final static String indexPath = "/Users/leomelzer/Downloads/Tweets/";?
What does this path contain? I tried it, but I am getting the following error:

START: reading file list
org.apache.lucene.index.IndexNotFoundException: no segments* file found in org.apache.lucene.store.SimpleFSDirectory@C:\vinay\apache-lucene\org\apache\lucene\index lockFactory=org.apache.lucene.store.NativeFSLockFactory@ed1f14: files: [BufferedDeletes$Num.class, BufferedDeletes.class, ByteBlockPool$Allocator.class, ByteBlockPool.class, ByteSliceReader.class, ByteSliceWriter.class, CharBlockPool.class, CheckIndex$MySegmentTermDocs.class, CheckIndex$Status$SegmentInfoStatus.class, CheckIndex$Status.class, CheckIndex.class, CompoundFileReader$1.class, CompoundFileReader$CSIndexInput.class, CompoundFileReader$FileEntry.class, CompoundFileReader.class, CompoundFileWriter$1.class, CompoundFileWriter$FileEntry.class, CompoundFileWriter.class, ConcurrentMergeScheduler$MergeThread.class, ConcurrentMergeScheduler.class, CorruptIndexException.class, DefaultSkipListReader.class, DefaultSkipListWriter.class, DirectoryIndexReader$1.class, DirectoryIndexReader$2.class, DirectoryIndexReader$ReaderCommit.class, DirectoryIndexReader.class, DocConsumer.class, DocConsumerPerThread.class, DocFieldConsumer.class, DocFieldConsumerPerField.class, DocFieldConsumerPerThread.class, DocFieldConsumers$PerDoc.class, DocFieldConsumers.class, DocFieldConsumersPerField.class, DocFieldConsumersPerThread.class, DocFieldProcessor.class, DocFieldProcessorPerField.class, DocFieldProcessorPerThread.class, DocInverter$FieldInvertState.class, DocInverter.class, DocInverterPerField.class, DocInverterPerThread.class, DocumentsWriter$1.class, DocumentsWriter$ByteBlockAllocator.class, DocumentsWriter$DocState.class, DocumentsWriter$DocWriter.class, DocumentsWriter$FlushState.class, DocumentsWriter$SkipDocWriter.class, DocumentsWriter$WaitQueue.class, DocumentsWriter.class, DocumentsWriterThreadState.class, FieldInfo.class, FieldInfos.class, FieldReaderException.class, FieldSortedTermVectorMapper.class, FieldsReader$FieldForMerge.class, FieldsReader$LazyField.class, FieldsReader.class, FieldsWriter.class, FilterIndexReader$FilterTermDocs.class, FilterIndexReader$FilterTermEnum.class, FilterIndexReader$FilterTermPositions.class, FilterIndexReader.class, FreqProxFieldMergeState.class, FreqProxTermsWriter$PostingList.class, FreqProxTermsWriter.class, FreqProxTermsWriterPerField.class, FreqProxTermsWriterPerThread.class, IndexCommit.class, IndexCommitPoint.class, IndexDeletionPolicy.class, IndexFileDeleter$1.class, IndexFileDeleter$CommitPoint.class, IndexFileDeleter$RefCount.class, IndexFileDeleter.class, IndexFileNameFilter.class, IndexFileNames.class, IndexModifier.class, IndexReader$1.class, IndexReader$2.class, IndexReader$FieldOption.class, IndexReader.class, IndexWriter$MaxFieldLength.class, IndexWriter.class, IntBlockPool.class, InvertedDocConsumer.class, InvertedDocConsumerPerField.class, InvertedDocConsumerPerThread.class, InvertedDocEndConsumer.class, InvertedDocEndConsumerPerField.class, InvertedDocEndConsumerPerThread.class, KeepOnlyLastCommitDeletionPolicy.class, LogByteSizeMergePolicy.class, LogDocMergePolicy.class, LogMergePolicy.class, MergeDocIDRemapper.class, MergePolicy$MergeAbortedException.class, MergePolicy$MergeException.class, MergePolicy$MergeSpecification.class, MergePolicy$OneMerge.class, MergePolicy.class, MergeScheduler.class, MultiLevelSkipListReader$SkipBuffer.class, MultiLevelSkipListReader.class, MultiLevelSkipListWriter.class, MultipleTermPositions$1.class, MultipleTermPositions$IntQueue.class, MultipleTermPositions$TermPositionsQueue.class, MultipleTermPositions.class, MultiReader.class, 
MultiSegmentReader$MultiTermDocs.class, MultiSegmentReader$MultiTermEnum.class, MultiSegmentReader$MultiTermPositions.class, MultiSegmentReader.class, NormsWriter.class, NormsWriterPerField.class, NormsWriterPerThread.class, ParallelArrayTermVectorMapper.class, ParallelReader$ParallelTermDocs.class, ParallelReader$ParallelTermEnum.class, ParallelReader$ParallelTermPositions.class, ParallelReader.class, Payload.class, PositionBasedTermVectorMapper$TVPositionInfo.class, PositionBasedTermVectorMapper.class, RawPostingList.class, ReadOnlyMultiSegmentReader.class, ReadOnlySegmentReader.class, ReusableStringReader.class, SegmentInfo.class, SegmentInfos$1.class, SegmentInfos$2.class, SegmentInfos$FindSegmentsFile.class, SegmentInfos.class, SegmentMergeInfo.class, SegmentMergeQueue.class, SegmentMerger$1.class, SegmentMerger$CheckAbort.class, SegmentMerger.class, SegmentReader$Norm.class, SegmentReader.class, SegmentTermDocs.class, SegmentTermEnum.class, SegmentTermPositions.class, SegmentTermPositionVector.class, SegmentTermVector.class, SerialMergeScheduler.class, SnapshotDeletionPolicy$MyCommitPoint.class, SnapshotDeletionPolicy.class, SortedTermVectorMapper.class, StaleReaderException.class, StoredFieldsWriter$PerDoc.class, StoredFieldsWriter.class, StoredFieldsWriterPerField.class, StoredFieldsWriterPerThread.class, Term.class, TermBuffer.class, TermDocs.class, TermEnum.class, TermFreqVector.class, TermInfo.class, TermInfosReader$1.class, TermInfosReader$ThreadResources.class, TermInfosReader.class, TermInfosWriter.class, TermPositions.class, TermPositionVector.class, TermsHash.class, TermsHashConsumer.class, TermsHashConsumerPerField.class, TermsHashConsumerPerThread.class, TermsHashPerField.class, TermsHashPerThread.class, TermVectorEntry.class, TermVectorEntryFreqSortedComparator.class, TermVectorMapper.class, TermVectorOffsetInfo.class, TermVectorsReader.class, TermVectorsTermsWriter$PerDoc.class, TermVectorsTermsWriter$PostingList.class, TermVectorsTermsWriter.class, TermVectorsTermsWriterPerField.class, TermVectorsTermsWriterPerThread.class, TermVectorsWriter.class]
at org.apache.lucene.index.SegmentInfos$FindSegmentsFile.run(SegmentInfos.java:741)
at org.apache.lucene.index.StandardDirectoryReader.open(StandardDirectoryReader.java:52)
at org.apache.lucene.index.DirectoryReader.open(DirectoryReader.java:65)
at org.apache.lucene.index.IndexReader.open(IndexReader.java:291)
at vinay.Analyse.main(Analyse.java:57)
FINISH: reading file list
START: calculating sentiment
Exception in thread "main" java.lang.NoClassDefFoundError: net/arnx/jsonic/JSONException
at vinay.Analyse.main(Analyse.java:91)
Caused by: java.lang.ClassNotFoundException: net.arnx.jsonic.JSONException
at java.net.URLClassLoader$1.run(Unknown Source)
at java.net.URLClassLoader$1.run(Unknown Source)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
at sun.misc.Launcher$AppClassLoader.loadClass(Unknown Source)
at java.lang.ClassLoader.loadClass(Unknown Source)
... 1 more
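
For anyone hitting the same two errors: the first trace suggests indexPath was pointed at Lucene's extracted .class files rather than at an actual index - it has to name a directory containing the segments_* files Lucene writes. The second error means jsonic, the JSON library that language-detection depends on, is missing; adding its jar to the classpath alongside langdetect.jar should resolve it. A minimal, hypothetical sketch of building a compatible index with the same Lucene 3.x API the gist uses (the field names "ID" and "text" are the ones Analyse reads back; path and tweet text are made up):

import java.io.File;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;

public class BuildIndex {
    public static void main(String[] args) throws Exception {
        // must point at an index directory, NOT at the Lucene jar contents
        Directory dir = FSDirectory.open(new File("/path/to/Tweets/"));
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36,
                new StandardAnalyzer(Version.LUCENE_36));
        IndexWriter writer = new IndexWriter(dir, config);

        Document doc = new Document();
        doc.add(new Field("ID", "1", Field.Store.YES, Field.Index.NOT_ANALYZED));
        doc.add(new Field("text", "I love this sunny day!",
                Field.Store.YES, Field.Index.ANALYZED));
        writer.addDocument(doc);

        // close() commits and writes the segments_* files IndexReader looks for
        writer.close();
    }
}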

@sudhir-govekar

Hello sir,
I am new to sentiment analysis. Can anyone help me out with a positive and negative word list?
I need to detect sarcasm vs. non-sarcasm in Twitter data.

Thank you in advance...

@BuildSuccessful

Can you please attach the sample files being used in the code?

@Bhagyasree1234

I am very new to this sentiment analysis, so can you please share the input file? It would be very helpful to me. Thanks in advance!
