-
-
Save CateGitau/3eac49225636ffdd7cc9268f4f1c94c6 to your computer and use it in GitHub Desktop.
# Named-entity recognition demo using the NLP and openNLP packages.

# Cap the JVM heap at 1024 MB. The option has to be set before rJava (or any
# package that depends on it) starts the JVM, and the flag must be written
# without a space after the dash, otherwise the JVM does not recognise it.
options(java.parameters = "-Xmx1024m")

# load libraries
library(openxlsx)
library(rJava)
library(NLP)
library(openNLP)
library(RWeka)

# read text
text <- c(
  "My name is Catherine Gitau, I work at Ongair Limited in Nairobi, Kenya"
)

# collapse the character vector into a single string
text <- paste(text, collapse = " ")
print(text)

# convert the text into an NLP String object, which the annotators expect
text <- as.String(text)

# create annotators for words and sentences
word_ann <- Maxent_Word_Token_Annotator()
sent_ann <- Maxent_Sent_Token_Annotator()

# identify where the sentences and the words are
text_annotations <- annotate(text, list(sent_ann, word_ann))
head(text_annotations)

# combine the text and its annotations into one document
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
words(text_doc)

# create annotators of kind person, location and organization
# NOTE(review): the entity annotators need the language models to be installed
# (presumably the openNLPmodels.en package) -- confirm in your environment.
person_ann <- Maxent_Entity_Annotator(kind = "person")
location_ann <- Maxent_Entity_Annotator(kind = "location")
organization_ann <- Maxent_Entity_Annotator(kind = "organization")

# hold the annotators in the order they are to be applied: sentence and word
# annotations come first because they are listed ahead of the entity annotators
pipeline <- list(
  sent_ann,
  word_ann,
  person_ann,
  location_ann,
  organization_ann
)

# re-annotate with the full pipeline and rebuild the annotated document
text_annotations <- annotate(text, pipeline)
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
# Extract entities from an AnnotatedPlainTextDocument.
#
# Arguments:
#   text  An AnnotatedPlainTextDocument (despite the name, not a raw string).
#   kind  Optional entity kind such as "person", "location" or "organization".
#         When omitted, every annotation whose type is "entity" is selected.
#
# Returns the pieces of the document content covered by the selected
# annotations.
entities <- function(text, kind) {
  s <- text$content
  # Current versions of NLP keep a single Annotation object per document and
  # expose it through annotation(); the older annotations() accessor is gone,
  # which is what produced the "could not find function" error.
  a <- annotation(text)
  if (missing(kind)) {
    return(s[a[a$type == "entity"]])
  }
  # Sentence and word annotations carry no "kind" feature. Map those to NA so
  # the lookup always yields a plain character vector, whatever the input.
  k <- vapply(
    a$features,
    function(f) {
      value <- f[["kind"]]
      if (is.null(value)) NA_character_ else as.character(value)[1L]
    },
    character(1)
  )
  s[a[!is.na(k) & k == kind]]
}
# Show the entities of kind "person" found in the annotated document.
entities(text_doc, kind = "person")
Yes, unlist is easier, and it can be used where you don't really need punctuation in your analysis. But I would not advise using it if you're planning to do sentiment analysis, since punctuation marks such as exclamation marks (!) are needed there.
Hi CateGitau,
I tried NER.r but got this error: error in annotations(text): can't find function "annotations"
Traceback:
- entities(text_doc, kind = "person")
the complete output was:
[1] "My name is Catherine Gitau, I work at Ongair Limited in Nairobi, Kenya"
id type start end features
1 sentence 1 70 constituents=<<integer,15>>
2 word 1 2
3 word 4 7
4 word 9 10
5 word 12 20
6 word 22 26
'My' 'name' 'is' 'Catherine' 'Gitau' ',' 'I' 'work' 'at' 'Ongair' 'Limited' 'in' 'Nairobi' ',' 'Kenya'
Error in annotations(text): non trovo la funzione "annotations"
Traceback:
- entities(text_doc, kind = "person")
Change annotations to annotation.
a <- annotations(text)[[1]]
a <- annotation(text)[[1]]
Ref: https://www.rdocumentation.org/packages/NLP/versions/0.2-1/topics/AnnotatedPlainTextDocument
Hi Catherine, thank you for the code you presented above. It helps a lot. However, when I run the code, I obtained the following results
entities(text_doc, kind = "person")
character(0)
entities(text_doc, kind = "location")
character(0)
entities(text_doc, kind = "organization")
character(0)
In the annotations, see the following example results, I hope it worked well
text_doc <- AnnotatedPlainTextDocument(text, text_annotations)
words(text_doc)
[1] "c("" "Together"
[3] "we" "can"
[5] "stop" "Floods"
[7] "in" "#"
[9] "Dar_es_salaam" "https"
[11] "://t.co/MRTeSXYghq" "."
[13] "Let" "Stop"
[15] "Floods" "from"
[17] "our\ncommunities" "in"
[19] "#" "Dar_es_salaam"
[21] "https" "://t.co/LghNMt1C1v\n\nEarly"
[23] "Today" "."
[25] "Community" "engagement"
[27] "in" "Project"
What do you think is causing the final result to be character(0)?
I am working with Twitter data and would like to extract the place names and nouns mentioned in the tweets so that I can geocode the places.
Thank you for helping
For those looking: the entities function does not work because annotations was deprecated. I found the solution in a YouTube comment, so I thought I'd cross-post it for anyone searching. You will need to install and load the coreNLP library, change annotations to annotation, and remove the [[1]] brackets after the document. The function should then recognize and output the identified entities when it is run with a kind specified.
# R is case-sensitive: the installer function is install.packages().
# NOTE(review): annotation() is documented as part of the NLP package, so
# coreNLP may not actually be required here -- confirm before relying on it.
install.packages("coreNLP")
library(coreNLP)

# Extract entities from an annotated document.
#   doc   the annotated document to read entities from
#   kind  optional entity kind (e.g. "person"); when omitted, all annotations
#         of type "entity" are returned
entities <- function(doc, kind) {
  s <- doc$content
  # annotation() yields the document's annotations directly, so no [[1]] here
  a <- annotation(doc)
  if(hasArg(kind)) {
    # `[[` must be back-quoted to be passed to sapply() as a function
    k <- sapply(a$features, `[[`, "kind")
    s[a[k == kind]]
  } else {
    s[a[a$type == "entity"]]
  }
}
I found an easier way instead of annotation.
You could get rid of the punctuations then use:
Or what do you think?