Skip to content

Instantly share code, notes, and snippets.

@geoHeil
Created April 4, 2017 05:17
Show Gist options
  • Save geoHeil/bfb01427b88cf58ea755f912ce539712 to your computer and use it in GitHub Desktop.
Save geoHeil/bfb01427b88cf58ea755f912ce539712 to your computer and use it in GitHub Desktop.
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
import net.ruippeixotog.scalascraper.scraper.ContentExtractors.elementList
// Shared browser instance used by every `parseString` call in this script.
// `lazy` defers construction until first use; `@transient` keeps the field out of
// Java serialization — presumably so the enclosing object can be shipped to Spark
// executors and the browser rebuilt there (TODO confirm the intended runtime).
@transient lazy val browser = JsoupBrowser()
// Sample record body that has NO enclosing <doc> element and NO <text> element:
// only docno / docid / date / section / graphic / type fragments.
// Paired with the path "second" when the `raw` inputs are built.
val broken =
"""
|<docno>
| LA051089-0001
| </docno>
| <docid>
| 54901
| </docid>
| <date>
| <p> May 10, 1989, Wednesday, Home Edition </p>
| </date>
| <section>
| <p> Metro; Part 2; Page 2; Column 2 </p>
| </section>
| <graphic>
| <p> Photo, Cloudy and Clear A stormy afternoon provides a clear view of Los Angeles' skyline, with the still-emerging Library Tower rising above its companion buildings. KEN LUBAS / Los Angeles Times </p>
| </graphic>
| <type>
| <p> Wild Art </p>
| </type>
""".stripMargin
// Sample record body with the full structure: one <DOC> wrapping a <DOCNO>,
// a <PARENT> and a <TEXT> element (the text is interleaved with PJG markup comments).
// Paired with the path "first" when the `raw` inputs are built.
val correct =
"""
|<DOC>
|<DOCNO> FR940104-0-00001 </DOCNO>
|<PARENT> FR940104-0-00001 </PARENT>
|<TEXT>
|
|<!-- PJG FTAG 4700 -->
|
|<!-- PJG STAG 4700 -->
|
|<!-- PJG ITAG l=90 g=1 f=1 -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=90 g=1 f=4 -->
|Federal Register
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=90 g=1 f=1 -->
|&blank;/&blank;Vol. 59, No. 2&blank;/&blank;Tuesday, January 4, 1994&blank;/&blank;Rules and Regulations
|
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=01 g=1 f=1 -->
|Vol. 59, No. 2
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=02 g=1 f=1 -->
|Tuesday, January 4, 1994
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG /STAG -->
|
|<!-- PJG /FTAG -->
|</TEXT>
|</DOC>
""".stripMargin
/**
 * One unparsed input record.
 *
 * @param path    identifier of where the record came from; copied into [[TopicContent.filepath]]
 * @param content raw markup that is handed to the browser's `parseString`
 */
case class RawRecords(
  path: String,
  content: String
)

/**
 * One extracted document.
 *
 * @param topic    text of the document's `docno` element
 * @param content  text of the document's `text` element
 * @param filepath the `path` of the [[RawRecords]] the document was found in
 */
case class TopicContent(
  topic: String,
  content: String,
  filepath: String
)
// Two sample inputs, in order: the `correct` body under path "first",
// then the `broken` body under path "second".
val raw = Seq(
  "first" -> correct,
  "second" -> broken
).map { case (path, content) => RawRecords(path, content) }

// `mapToTopics` returns an Iterator, so this only holds the iterator it hands back.
val result = mapToTopics(raw.iterator)
// Variant 1
/**
 * Parses every raw record and emits one [[TopicContent]] per `<doc>` element found in it.
 *
 * A `<doc>` that has no `docno` or no `text` element is skipped instead of aborting
 * the whole iterator with a `NoSuchElementException`; a record containing no `<doc>`
 * element at all simply contributes nothing.
 *
 * @param iterator raw records to parse; consumed lazily
 * @return a lazy iterator of the extracted documents, each tagged with the `path`
 *         of the record it came from
 */
def mapToTopics(iterator: Iterator[RawRecords]): Iterator[TopicContent] =
  iterator.flatMap { record =>
    val documents = browser.parseString(record.content) >> elementList("doc")
    documents.flatMap { doc =>
      // `>?>` yields None when the element is missing, where `>>` would throw.
      for {
        docno <- doc >?> text("docno")
        textContent <- doc >?> text("text")
      } yield TopicContent(docno, textContent, record.path)
    }
  }
// Variant where `.map` is applied to the extractor itself: method selection binds
// tighter than the infix `>>`, so the lambda is part of the extractor and runs on
// what `elementList("doc")` extracts (presumably the whole list of <doc> elements —
// TODO confirm). Per the original note, this produces Options of strings.
browser.parseString(raw(0).content) >> (elementList("doc").map { docs =>
  val docnos = docs >> text("docno")
  val texts = docs >> text("text")
  (docnos.headOption, texts.headOption)
})
// Here the extraction has already run, so `documents` holds the individual <doc>
// elements and `.map` visits them one at a time. For a single element
// `d >> text(...)` is a plain String, and `.headOption` on a String yields its first
// character (Option[Char]) — that is why the earlier version "mapped to characters".
// `>?>` extracts an Option[String] directly and is None when the element is missing.
val documents = browser.parseString(raw(0).content) >> elementList("doc")
documents.map { d =>
  val docno = d >?> text("docno")
  val textContent = d >?> text("text")
  (docno, textContent)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment