Last active
May 16, 2019 10:04
-
-
Save anup50695/e79c492da3115bbfabfe980d63a02016 to your computer and use it in GitHub Desktop.
Key Phrase Extraction from Tweets
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Extract adjective/noun key phrases from a piece of text
#'
#' Tokenizes and part-of-speech tags `x` with the openNLP Maxent annotators,
#' groups consecutive tokens whose tag contains "NN" or "JJ" into chunks, and
#' keeps the chunks whose dash-joined tag sequence matches "JJ-NN|NN.-NN".
#'
#' Requires the NLP and openNLP packages to be attached (`as.String`,
#' `annotate`, and the `Maxent_*_Annotator` constructors come from them).
#'
#' @param x Text to analyse; coerced with `as.String()`.
#' @return A character vector with one space-joined phrase per matching chunk,
#'   named by the chunk's dash-joined tag sequence (e.g. "JJ-NN"). A
#'   zero-length character vector when nothing matches.
extractChunks <- function(x) {
  x <- as.String(x)

  # Sentence and word boundaries have to be annotated before POS tagging.
  wordAnnotation <- annotate(
    x,
    list(Maxent_Sent_Token_Annotator(), Maxent_Word_Token_Annotator())
  )
  POSAnnotation <- annotate(x, Maxent_POS_Tag_Annotator(), wordAnnotation)
  POSwords <- subset(POSAnnotation, type == "word")

  # vapply keeps the result a character vector even when there are no words.
  tags <- vapply(POSwords$features, `[[`, character(1), "POS")
  if (length(tags) == 0) {
    return(character(0))
  }
  tokens <- as.character(x[POSwords])

  # A token is a chunk candidate when its tag contains "NN" or "JJ".
  is_candidate <- grepl("NN|JJ", tags)

  # A chunk starts at every candidate that is not preceded by a candidate;
  # the running count of starts numbers each run of candidates 1, 2, 3, ...
  # This replaces the index loop, which failed on inputs with fewer than two
  # tokens because `2:nrow(...)` counts backwards.
  previous_is_candidate <- c(FALSE, is_candidate[-length(is_candidate)])
  chunk_id <- cumsum(is_candidate & !previous_is_candidate)

  # Non-candidate tokens can never form a matching chunk (the pattern below
  # needs "JJ" or "NN" inside a single tag), so they are dropped up front.
  text_chunk <- split(tokens[is_candidate], chunk_id[is_candidate])
  tag_pattern <- vapply(
    split(tags[is_candidate], chunk_id[is_candidate]),
    paste,
    character(1),
    collapse = "-"
  )
  names(text_chunk) <- tag_pattern

  # Extract chunks matching pattern
  # NOTE(review): "NN.-NN" requires one character after the first "NN", so a
  # plain "NN-NN" chunk is not selected -- confirm this is intended.
  res <- text_chunk[grepl("JJ-NN|NN.-NN", tag_pattern)]
  vapply(res, paste, character(1), collapse = " ")
}
How to show the output of the code?
It's just simply running; I don't get where it is getting the tweets from.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
requirements:
install.packages("NLP")
library("NLP")
install.packages("openNLP")
library("openNLP")