-
-
Save CateGitau/3eac49225636ffdd7cc9268f4f1c94c6 to your computer and use it in GitHub Desktop.
# Give the JVM a 1 GB heap. The option has to be set before rJava starts the
# JVM, and the flag must be written without a space after the dash.
options(java.parameters = "-Xmx1024m")
# load libraries
library(openxlsx)
library(rJava)
library(NLP)
library(openNLP)
library(RWeka)
# read text
text <- c("My name is Catherine Gitau, I work at Ongair Limited in Nairobi, Kenya")
# collapse the character vector into a single string
text <- paste(text, collapse = " ")
print(text)
# convert the text into an NLP String object
text <- as.String(text)
# create annotators for words and sentences
word_ann <- Maxent_Word_Token_Annotator()
sent_ann <- Maxent_Sent_Token_Annotator()
# identify where the sentences and the words are
# (the sentence annotator is listed first, then the word annotator)
text_annotations <- annotate(text, list(sent_ann, word_ann))
head(text_annotations)
# combine the text and its annotations into one document
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
words(text_doc)
# create annotators of kind person, location and organization
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")
# hold the annotators in the order in which they are to be applied
pipeline <- list(
  sent_ann,
  word_ann,
  person_ann,
  location_ann,
  organization_ann
)
# re-annotate with the full pipeline and rebuild the document so that it also
# carries the entity annotations
text_annotations <- annotate(text, pipeline)
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
# Extract entities from an AnnotatedPlainTextDocument
#
# text: an AnnotatedPlainTextDocument, e.g. the result of running the
#       annotation pipeline above.
# kind: optional entity kind ("person", "location", "organization", ...).
#       When omitted, every annotation whose type is "entity" is returned.
# Returns the pieces of the document content covered by the matching
# annotations.
entities <- function(text, kind) {
  s <- text$content
  # NLP >= 0.2 keeps a single Annotation object per document and exposes it
  # through annotation(); the former annotations(text)[[1]] accessor no
  # longer works there.
  a <- annotation(text)
  if (missing(kind)) {
    return(s[a[a$type == "entity"]])
  }
  # Annotations without a "kind" feature are mapped to NA so that the lookup
  # always yields a plain character vector rather than a list.
  k <- vapply(
    a$features,
    function(f) {
      value <- f[["kind"]]
      if (is.null(value)) NA_character_ else as.character(value)[1L]
    },
    character(1)
  )
  s[a[!is.na(k) & k == kind]]
}
entities(text_doc, kind = "person")
Hi Catherine, thank you for the code you presented above; it helps a lot. However, when I run the code, I obtain the following results:
entities(text_doc, kind = "person")
character(0)
entities(text_doc, kind = "location")
character(0)
entities(text_doc, kind = "organization")
character(0)
For the annotations, see the following example results; I hope that step worked well:
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
words(text_doc)
[1] "c("" "Together"
[3] "we" "can"
[5] "stop" "Floods"
[7] "in" "#"
[9] "Dar_es_salaam" "https"
[11] "://t.co/MRTeSXYghq" "."
[13] "Let" "Stop"
[15] "Floods" "from"
[17] "our\ncommunities" "in"
[19] "#" "Dar_es_salaam"
[21] "https" "://t.co/LghNMt1C1v\n\nEarly"
[23] "Today" "."
[25] "Community" "engagement"
[27] "in" "Project"
Where do you think the problem is that causes the final result to be character(0)?
I am working with Twitter data and would like to get the place names and nouns mentioned in the tweets so that I can geocode the places.
Thank you for helping.
For those looking: the entities function does not work because annotations() was deprecated. I found the solution in a YouTube comment, so I thought I'd cross-post it for those searching. You will need to install and load the coreNLP library, change annotations to annotation, and remove the [[1]] brackets after the document. The function should then recognize and output the identified entities when it is run with a kind specified.
install.packages("coreNLP")
library(coreNLP)
# Extract entities from an AnnotatedPlainTextDocument.
# doc:  the annotated document.
# kind: optional entity kind; when omitted, all annotations of type "entity"
#       are returned.
entities <- function(doc, kind) {
  s <- doc$content
  # annotation() is called without a trailing [[1]] here
  a <- annotation(doc)
  if (hasArg(kind)) {
    # `[[` must be back-quoted to be passed as a function
    k <- sapply(a$features, `[[`, "kind")
    s[a[k == kind]]
  } else {
    s[a[a$type == "entity"]]
  }
}
Change annotations to annotation.
a <- annotations(text)[[1]]
a <- annotation(text)[[1]]
Ref: https://www.rdocumentation.org/packages/NLP/versions/0.2-1/topics/AnnotatedPlainTextDocument