Skip to content

Instantly share code, notes, and snippets.

@hscells
Last active September 16, 2019 19:55
Show Gist options
  • Star 2 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save hscells/f08be357aec1757b231031dead3eba35 to your computer and use it in GitHub Desktop.
Save hscells/f08be357aec1757b231031dead3eba35 to your computer and use it in GitHub Desktop.
PubMed Logstash pipeline
# http://www.front2backdev.com/2015/08/26/pulling-and-splitting-live-xml/
# Read the PubMed XML exports from disk and emit one event per
# <PubmedArticle> element.
input {
  file {
    path => "/Users/harryscells/Docker/sigir2017-elastic4ir/*.xml"
    # Start reading newly discovered files from their first line.
    start_position => "beginning"
    # Discard read-position bookkeeping so the files are re-read on every run.
    # The watched path is Unix-style, so the null device is /dev/null; "NUL" is
    # the Windows null device and on Unix would be an ordinary file that
    # persists positions between runs.
    sincedb_path => "/dev/null"
    codec => multiline {
      # With negate => true and what => "previous", every line that does NOT
      # match the pattern is appended to the preceding event. A new event
      # therefore begins at each <PubmedArticle> line and at the closing
      # </PubmedArticleSet> line.
      pattern => "<PubmedArticle>|</PubmedArticleSet>"
      negate => true
      what => "previous"
      # Flush a pending event after 1 second without new lines so the last
      # article in a file is not held back.
      auto_flush_interval => 1
    }
  }
}
# Parse each article's XML and lift the abstract, title and creation date
# into top-level fields.
filter {
  # Drop events that consist solely of the wrapping set tags.
  # NOTE(review): this is an exact string comparison; if the opening tag is
  # grouped with preceding lines (e.g. an XML declaration) by the multiline
  # codec it will not match -- confirm against the real input files.
  if [message] == "<PubmedArticleSet>" or [message] == "</PubmedArticleSet>" {
    drop {}
  }

  # Parse the raw XML text into a nested structure under [citation].
  xml {
    source => "message"
    target => "citation"
    force_array => false
  }

  # Copy the abstract only when the parsed AbstractText exposes a "content"
  # key, so documents without one do not get a literal "%{...}" placeholder.
  # NOTE(review): presumably "content" is only present when the element
  # carries attributes -- verify that plain-text abstracts are not skipped.
  if [citation][MedlineCitation][Article][Abstract][AbstractText][content] {
    mutate {
      add_field => { "abstract" => "%{[citation][MedlineCitation][Article][Abstract][AbstractText][content]}" }
    }
  }

  # Build and parse "created" only when the record actually has a DateCreated
  # element. Without this guard the field would contain unresolved "%{...}"
  # text, the date filter would fail, and the bad string would be sent to
  # Elasticsearch.
  if [citation][MedlineCitation][DateCreated] {
    mutate {
      add_field => { "created" => "%{[citation][MedlineCitation][DateCreated][Day]}-%{[citation][MedlineCitation][DateCreated][Month]}-%{[citation][MedlineCitation][DateCreated][Year]}" }
    }
    # Replace the day-month-year string with a parsed date value in place.
    date {
      match => ["created", "dd-MM-yyyy"]
      target => "created"
    }
  }

  # Expose the title and strip the raw XML and file-input bookkeeping fields.
  mutate {
    add_field => { "title" => "%{[citation][MedlineCitation][Article][ArticleTitle]}" }
    remove_field => ["message", "path", "@timestamp", "host"]
  }
}
# Ship every parsed citation to the local Elasticsearch node.
output {
  elasticsearch {
    hosts => ["localhost:9200"]

    # Destination index and mapping type.
    index => "pubmed"
    document_type => "medline_citation"

    # Use the article's PMID text as the document id.
    document_id => "%{[citation][MedlineCitation][PMID][content]}"
  }

  # Uncomment to print each event to the console while debugging.
  # stdout { codec => rubydebug }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment