Skip to content

Instantly share code, notes, and snippets.

@DanyMariaLee
Last active June 23, 2018 02:00
Show Gist options
  • Save DanyMariaLee/7f0ad7263635cb4e82f8467c1f969176 to your computer and use it in GitHub Desktop.
Save DanyMariaLee/7f0ad7263635cb4e82f8467c1f969176 to your computer and use it in GitHub Desktop.
Parsing html with #scala-scraper
package ms.podium.services
import scala.util.Try
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
import net.ruippeixotog.scalascraper.dsl.DSL.Extract._
import net.ruippeixotog.scalascraper.dsl.DSL._
/*
Yet another example of scala-scraper usage.
The goal: get the list of countries from wikipedia.org,
then, for every country, get the path to
that country's list of cities.
Original page https://en.wikipedia.org/wiki/Lists_of_cities_by_country
Paths for city lists:
- https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_COUNTRY_NAME
- https://en.wikipedia.org/wiki/List_of_cities_in_COUNTRY_NAME
*/
/** Scrapes Wikipedia for a list of countries and, for each country, the link
  * texts found on that country's "list of cities" page.
  *
  * Note that [[cityByCountry]] is a strict `val`: mixing in this trait performs
  * all the page downloads while the trait is being initialised.
  */
trait ParseWiki {

  // Declared before `cityByCountry` so it is already initialised when
  // `parseCountry` is invoked during trait initialisation.
  // NOTE(review): `[a-zA-Z]+` only accepts single-word names, so entries such as
  // "List of cities in South Africa" are dropped — confirm this is intended.
  private val countryPattern = """List of cities in ([a-zA-Z]+)""".r

  /** Browser used for every page download. */
  val browser = JsoupBrowser()

  /** Index page from which the country names are read. */
  val countriesUrl: String = """https://en.wikipedia.org/wiki/Lists_of_cities_by_country"""

  /** First candidate URL for a country's city list ("cities and towns" form). */
  def cityUrl0(country: String): String =
    s"""https://en.wikipedia.org/wiki/List_of_cities_and_towns_in_$country"""

  /** Fallback candidate URL for a country's city list ("cities" form). */
  def cityUrl1(country: String): String =
    s"""https://en.wikipedia.org/wiki/List_of_cities_in_$country"""

  /** Downloads [[countriesUrl]] and returns every country name recognised by
    * [[parseCountry]] among the strings extracted with the
    * `.mw-body-content b` selector.
    *
    * Download failures are not handled here and propagate to the caller.
    */
  def getCountries: Vector[String] = {
    val doc = browser.get(countriesUrl)
    val listOfCitiesInNames = doc >> ".mw-body-content b"
    listOfCitiesInNames.map(parseCountry).filter(_.nonEmpty).toVector
  }

  /** Returns the city names scraped for `country`.
    *
    * Tries [[cityUrl0]] first and only fetches [[cityUrl1]] if the first
    * download throws. The result is empty when neither page can be fetched or
    * when extracting the table from the fetched page throws.
    *
    * @param country country name as it appears in the Wikipedia URL
    */
  def getCities(country: String): Vector[String] =
    Try(browser.get(cityUrl0(country)))
      .orElse(Try(browser.get(cityUrl1(country))))
      // `Try.map` also captures exceptions thrown by the extraction itself
      // (presumably raised when the page has no `table` element), so a single
      // odd page yields an empty result instead of aborting the whole scrape.
      .map { doc =>
        val table = doc >> element("table")
        table
          .flatMap(e => e >> "td a")
          .zipWithIndex
          // Keep every other link text, starting with the first one.
          // NOTE(review): presumably the cells alternate city / other links —
          // verify against the actual page layout.
          .collect { case (name, idx) if idx % 2 == 0 => name }
          .toVector
      }
      .getOrElse(Vector.empty)

  /** Extracts the country name from a string of the exact form
    * `"List of cities in <Country>"`.
    *
    * @return the captured country name, or `""` when `str` does not match
    */
  def parseCountry(str: String): String =
    str match {
      case countryPattern(country) => country
      case _                       => ""
    }

  /** Every scraped country paired with its cities, followed by a hand-written
    * "Vatican City" entry.
    *
    * The extra entry is appended as part of this value; appending in a separate
    * statement would build a new Vector and discard it.
    */
  val cityByCountry: Vector[(String, Vector[String])] =
    getCountries.map(c => c -> getCities(c)) :+ ("Vatican City" -> Vector("Vatican City"))
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment