Skip to content

Instantly share code, notes, and snippets.

@CliffordAnderson
Last active May 20, 2020 23:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save CliffordAnderson/e1b1db2819df586a0b38a42903288012 to your computer and use it in GitHub Desktop.
Save CliffordAnderson/e1b1db2819df586a0b38a42903288012 to your computer and use it in GitHub Desktop.
Text Mining at Scale (XQuery Working Group) Natural Language Processing

Natural Language Processing

Today, we’ll be exploring patterns in a corpus of genuine and fake news collected in 2016 by BuzzFeed and scored for veracity by professional journalists. As you might imagine, the corpus contains very partisan perspectives; individual articles may contain disturbing language and viewpoints. To run the initial code example below, you will need to have downloaded the data set and created a database called articles.

We’ll begin our investigation of natural language processing by using Aylien, which bills itself as a “News Intelligence Platform,” to classify these articles, analyze their topics, identify the people, places, and things they discuss, and discern the sentiment or tone of the articles. If you would like to follow along, please sign up for a free API key.

xquery version "3.1";
(:~
 : Strips mis-encoded (mojibake) characters from article text and collapses
 : runs of whitespace.
 :
 : @param $mainText raw text of a title or article body; may be the empty sequence
 : @return the cleaned text, with leading/trailing whitespace removed and internal
 :         whitespace collapsed to single spaces; "" when $mainText is empty
 :)
declare function local:clean-syntax($mainText as xs:string?) as xs:string {
  (: None of the patterns below contains a space, so cleaning the whole string
     gives the same result as cleaning it token by token and re-joining on " ". :)
  fn:replace($mainText, "Â", "")
    => fn:translate("’", "'")
    => fn:replace("'œ", "'")
    => fn:replace("˜", "")
    (: The pattern at this step was an empty string, which fn:replace rejects
       with err:FORX0003. Presumably an unprintable character was lost when the
       query was copied; stripping the C1 control range is a best guess.
       TODO confirm against the raw data. :)
    => fn:replace("[&#x80;-&#x9F;]", "")
    => fn:replace("”", "")
    => fn:normalize-space()
};
(: Build one <record> per document in the "articles" collection, cleaning the
   title and body text, then store every record as its own file in a new
   "fakenews" database. :)
let $records :=
  for $doc at $position in fn:collection("articles")
  let $article := $doc/article
  return
    element record {
      attribute id { "article" || $position },
      element docId { fn:base-uri($doc) },
      element articleTitle { $article/title/text() => local:clean-syntax() },
      element articleAuthor { $article/author/text() },
      element articleVeracity { $article/veracity/text() },
      element articleURI { $article/uri/text() },
      element articleText { $article/mainText/text() => local:clean-syntax() }
    }
(: Each record is stored under "<its id>.xml", i.e. article1.xml, article2.xml, ... :)
let $paths := $records ! (@id || ".xml")
return db:create("fakenews", $records, $paths)
xquery version "3.1";
(: List of news sources in corpus :)
(: Each articleURI is reduced to its host name. The path part of the pattern is
   optional so that a bare "http://host" URI also reduces to "host" instead of
   passing through unchanged and appearing as a separate source. :)
for $source in fn:distinct-values(
  fn:collection("fakenews")/record/articleURI/text()
    ! fn:replace(., "https?://([^/]*)(/.*)?", "$1")
)
order by $source
return $source
xquery version "3.1";
(: Order news sources by veracity ranking :)
(: For every veracity level, emit the publications (host names) that carry that
   rating together with their article counts, most frequent first. :)
for $record in fn:collection("fakenews")
(: The path part of the pattern is optional so that a bare "http://host" URI is
   reduced to "host" as well. :)
let $host := $record/record/articleURI/text() => fn:replace("https?://([^/]*)(/.*)?", "$1")
group by $veracity := $record/record/articleVeracity/text()
return
  <veracity level="{$veracity}">
  {
    (: After the outer grouping $host holds every host name in this veracity
       level; a nested grouping counts each one in a single pass. :)
    for $name in $host
    group by $publication := $name
    let $count := fn:count($name)
    order by $count descending
    return <publication name="{$publication}" number="{$count}"/>
  }
  </veracity>
xquery version "3.1";
(: Evaluate the sentiment of an article using Aylien NLP :)
(:~
 : Sends one piece of text to an Aylien Text API endpoint and returns the result
 : of the HTTP call.
 :
 : The text travels in a form-encoded POST body rather than in the query string,
 : because a full article body can exceed the URL length a server will accept.
 :
 : @param $text    text to analyse; the empty sequence is sent as an empty string
 : @param $service endpoint path appended to the API root, e.g. "sentiment"
 : @return whatever http:send-request returns for the call
 :)
declare function local:send-request($text as xs:string?, $service as xs:string) {
  (: Placeholder credentials: replace "###" with your own application ID and key. :)
  let $appid := "###"
  let $key := "###"
  let $endpoint := "https://api.aylien.com/api/v1/"
  let $form := "text=" || fn:encode-for-uri($text)
  let $request :=
    <http:request method="post" href="{$endpoint || $service}">
      <http:header name="Accept" value="text/xml"/>
      <http:header name="X-AYLIEN-TextAPI-Application-Key" value="{$key}"/>
      <http:header name="X-AYLIEN-TextAPI-Application-ID" value="{$appid}"/>
      <http:body media-type="application/x-www-form-urlencoded" method="text">{$form}</http:body>
    </http:request>
  return http:send-request($request)
};
(: Take the body text of the first stored article and pass it to the
   "sentiment" endpoint. :)
let $body := fn:doc("fakenews/article1.xml")/record/articleText
return local:send-request($body, "sentiment")
xquery version "3.1";
(: Classify articles with the Aylien NLP :)
(: Passes the body text of the first stored article to the
   "classify/iptc-subjectcode" endpoint via local:send-request. :)
let $body := fn:doc("fakenews/article1.xml")/record/articleText
return local:send-request($body, "classify/iptc-subjectcode")
xquery version "3.1";
(: Get dates for articles with the Aylien NLP :)
(: Passes the first stored article to the "entities" endpoint and keeps only the
   text found under entities whose type is "date". :)
let $body := fn:doc("fakenews/article1.xml")/record/articleText
let $response := local:send-request($body, "entities")
return $response/result/entities/entity[@type = "date"]//text()
xquery version "3.1";
(: Summarize an article using Aylien NLP :)
(:~
 : Asks the Aylien "summarize" endpoint for a summary of an article and returns
 : the result of the HTTP call.
 :
 : The parameters travel in a form-encoded POST body rather than in the query
 : string, because a full article body can exceed the URL length a server will
 : accept.
 :
 : @param $text             article body; the empty sequence is sent as an empty string
 : @param $title            article title; the empty sequence is sent as an empty string
 : @param $sentences_number value sent as the "sentences_number" request parameter
 : @return whatever http:send-request returns for the call
 :)
declare function local:summarize($text as xs:string?, $title as xs:string?, $sentences_number as xs:integer) {
  (: Placeholder credentials: replace "###" with your own application ID and key. :)
  let $appid := "###"
  let $key := "###"
  let $endpoint := "https://api.aylien.com/api/v1/summarize/"
  let $form :=
    "text=" || fn:encode-for-uri($text)
    || "&amp;title=" || fn:encode-for-uri($title)
    || "&amp;sentences_number=" || $sentences_number
  let $request :=
    <http:request method="post" href="{$endpoint}">
      <http:header name="Accept" value="text/xml"/>
      <http:header name="X-AYLIEN-TextAPI-Application-Key" value="{$key}"/>
      <http:header name="X-AYLIEN-TextAPI-Application-ID" value="{$appid}"/>
      <http:body media-type="application/x-www-form-urlencoded" method="text">{$form}</http:body>
    </http:request>
  return http:send-request($request)
};
(: Request a one-sentence summary of the first stored article and keep only the
   <sentence> elements of the response. :)
let $record := fn:doc("fakenews/article1.xml")/record
let $response := local:summarize($record/articleText, $record/articleTitle, 1)
return $response//sentence
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment