Created
April 4, 2017 05:17
-
-
Save geoHeil/bfb01427b88cf58ea755f912ce539712 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import net.ruippeixotog.scalascraper.browser.JsoupBrowser | |
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._ | |
import net.ruippeixotog.scalascraper.dsl.DSL._ | |
import net.ruippeixotog.scalascraper.scraper.ContentExtractors.elementList | |
// Shared HTML parser used by every extraction below.
// NOTE(review): `@transient lazy` presumably keeps the browser out of serialized closures
// (e.g. when this script runs in spark-shell) and re-creates it on first use — confirm.
@transient lazy val browser = JsoupBrowser()
// Fixture: a record that is NOT wrapped in a <doc> element and has no <text> element,
// so a `doc` selector finds nothing in it.
val broken =
"""
|<docno>
| LA051089-0001
| </docno>
| <docid>
| 54901
| </docid>
| <date>
| <p> May 10, 1989, Wednesday, Home Edition </p>
| </date>
| <section>
| <p> Metro; Part 2; Page 2; Column 2 </p>
| </section>
| <graphic>
| <p> Photo, Cloudy and Clear A stormy afternoon provides a clear view of Los Angeles' skyline, with the still-emerging Library Tower rising above its companion buildings. KEN LUBAS / Los Angeles Times </p>
| </graphic>
| <type>
| <p> Wild Art </p>
| </type>
""".stripMargin
// Fixture: a well-formed record — a single <DOC> containing <DOCNO>, <PARENT> and <TEXT>.
val correct =
"""
|<DOC>
|<DOCNO> FR940104-0-00001 </DOCNO>
|<PARENT> FR940104-0-00001 </PARENT>
|<TEXT>
|
|<!-- PJG FTAG 4700 -->
|
|<!-- PJG STAG 4700 -->
|
|<!-- PJG ITAG l=90 g=1 f=1 -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=90 g=1 f=4 -->
|Federal Register
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=90 g=1 f=1 -->
|␣/␣Vol. 59, No. 2␣/␣Tuesday, January 4, 1994␣/␣Rules and Regulations
|
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=01 g=1 f=1 -->
|Vol. 59, No. 2
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG ITAG l=02 g=1 f=1 -->
|Tuesday, January 4, 1994
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG 0012 frnewline -->
|
|<!-- PJG /ITAG -->
|
|<!-- PJG /STAG -->
|
|<!-- PJG /FTAG -->
|</TEXT>
|</DOC>
""".stripMargin
/** One raw input file: `path` identifies where it came from, `content` is its unparsed markup. */
final case class RawRecords(path: String, content: String)

/**
 * One extracted document.
 *
 * @param topic    value taken from the document's `docno` element
 * @param content  value taken from the document's `text` element
 * @param filepath `path` of the [[RawRecords]] the document was found in
 */
final case class TopicContent(topic: String, content: String, filepath: String)
// Sample input: one record with a <DOC> wrapper and one without.
val raw = Seq(RawRecords("first", correct), RawRecords("second", broken))
// `result` is an Iterator: it is lazy and can be traversed only once, so nothing is parsed
// until it is consumed (e.g. with `result.toList`).
val result = mapToTopics(raw.iterator)
// Variant 1
/**
 * Parses every raw record and emits one [[TopicContent]] per `doc` element found in it.
 *
 * The optional extractor `>?>` is used instead of `>>`: `>>` with `text(...)` throws a
 * `NoSuchElementException` when the selector matches nothing, whereas `>?>` yields `None`.
 * A document lacking either a `docno` or a `text` element is therefore dropped rather than
 * aborting the whole iteration. Records containing no `doc` element contribute nothing.
 *
 * @param iterator raw records to parse; consumed lazily
 * @return a lazy iterator over the complete documents, tagged with the record's `path`
 */
def mapToTopics(iterator: Iterator[RawRecords]): Iterator[TopicContent] =
  iterator.flatMap { record =>
    val documents = browser.parseString(record.content) >> elementList("doc")
    documents.flatMap { doc =>
      for {
        docno       <- doc >?> text("docno")
        textContent <- doc >?> text("text")
      } yield TopicContent(docno, textContent, record.path)
    }
  }
// When broken down even further you see the following will produce Options of strings.
// Method selection binds tighter than the infix `>>`, so `.map` here is applied to the
// extractor `elementList("doc")` itself, not to the extracted result. The lambda therefore
// receives the whole list of matched elements; `>>` over that list yields a collection of
// strings, and `headOption` picks the first string (an Option[String]).
browser.parseString(raw(0).content) >> (elementList("doc").map { docs =>
  val docnos = docs >> text("docno")
  val texts = docs >> text("text")
  (docnos.headOption, texts.headOption)
})
// Here `.map` runs over the already-extracted list, so each `doc` is a single element and
// `doc >> text(...)` is a plain String. Calling `headOption` on a String returns its first
// character, which is why the earlier version "mapped to characters". The optional
// extractor `>?>` gives an Option[String] directly and is None when the tag is missing.
val documents = browser.parseString(raw(0).content) >> elementList("doc")
documents.map { doc =>
  (doc >?> text("docno"), doc >?> text("text"))
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment