Skip to content

Instantly share code, notes, and snippets.

@idursun
Created January 19, 2016 18:18
Show Gist options
  • Save idursun/5922f2fdf1e0ed759428 to your computer and use it in GitHub Desktop.
Save idursun/5922f2fdf1e0ed759428 to your computer and use it in GitHub Desktop.
package parsers
import java.net.URL
import java.time.LocalDate
import core.Rezalet
import org.jsoup.Jsoup
import org.jsoup.nodes.{Element, Document}
import scala.collection.JavaConversions._
/**
 * [[SiteParser]] that reads eksisozluk.com topic search results for the keyword
 * "rezaleti" and turns every topic titled `<day> <month> <year> <company> rezaleti`
 * into a [[core.Rezalet]].
 */
trait EksiSozlukSiteParser extends SiteParser {

  // Turkish month names in calendar order; a month's number is its index + 1.
  private val months = Seq("ocak", "şubat", "mart", "nisan", "mayıs", "haziran", "temmuz", "ağustos", "eylül", "ekim", "kasım", "aralık")

  // Compiled once instead of on every parseTitle call.
  // The month group uses \p{L}+ rather than \w+: java.util.regex treats \w as
  // ASCII-only by default, so names containing ş, ı, ğ or ü would never match.
  private val titlePattern = "(\\d{1,2})\\s(\\p{L}+)\\s(\\d{4})\\s(.*)rezaleti".r

  override def source: String = "eksisozluk"

  /** Builds the URL of the given page of the date-sorted "rezaleti" topic search. */
  def searchPageUrl(page: Integer): URL =
    new URL(s"https://eksisozluk.com/basliklar/ara?searchForm.Keywords=rezaleti&searchForm.SortOrder=Date&p=$page")

  /**
   * Extracts the company name and date from a topic title.
   *
   * @param title topic title; the whole string must have the form
   *              `<day> <month name> <year> <company> rezaleti`
   * @return the trimmed company name and the date, or `None` when the title does
   *         not have that form, the month name is not one of `months`, or
   *         day/month/year do not form a valid calendar date
   */
  def parseTitle(title: String): Option[TitleInfo] = title match {
    case titlePattern(day, month, year, company) =>
      months.indexOf(month) match {
        case -1 => None
        case monthIndex =>
          // LocalDate.of throws DateTimeException for impossible dates (e.g. day 31
          // in a 30-day month); report those as "not a valid title" instead.
          scala.util.Try(LocalDate.of(year.toInt, monthIndex + 1, day.toInt))
            .toOption
            .map(date => (company.trim, date): TitleInfo)
      }
    case _ => None
  }

  /**
   * Downloads one search result page and hands every parsable topic to `processor`.
   *
   * @param page      page number passed to [[searchPageUrl]]
   * @param processor invoked once per topic link whose title [[parseTitle]] accepts
   * @return `true` when the page contained at least one topic link, regardless of
   *         whether any title was parsable
   */
  override def parsePage(page: Int)(processor: RezaletProcessor): Boolean = {
    // Second argument is jsoup's fetch timeout in milliseconds.
    val document: Document = Jsoup.parse(searchPageUrl(page), 1000)
    val links = document.select("section#content-body ul.topic-list li a")

    // Reads the number held by the link's first child element, if any.
    // childNodeSize() also counts text nodes, so it does not guarantee that a child
    // *element* exists, and the element's text is not guaranteed to be numeric;
    // both cases yield None so the caller's default applies instead of an exception.
    def getEntryCount(link: Element): Option[Int] =
      if (link.childNodeSize() > 1)
        Option(link.children().first())
          .flatMap(counter => scala.util.Try(counter.ownText().trim.toInt).toOption)
      else None

    for {
      link <- links
      titleInfo <- parseTitle(link.ownText())
    } processor(
      Rezalet(
        company = titleInfo._1,
        date = titleInfo._2,
        source = source,
        link = link.attr("href"),
        popularity = getEntryCount(link).getOrElse(1),
        isActive = true
      )
    )

    links.nonEmpty
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment