Skip to content

Instantly share code, notes, and snippets.

@molekilla
Created March 28, 2012 22:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save molekilla/2231169 to your computer and use it in GitHub Desktop.
Save molekilla/2231169 to your computer and use it in GitHub Desktop.
Escuelita de Scala - Parte 4b - Indexamiento
// RM: https://github.com/molekilla/ScalaRegPubRobot/blob/master/src/com/ecyware/web/robot/RegPubStorage.scala
/**
 * Persists a record into `mongoCollection` when no matching document is found,
 * then hands the document to `indexRegPubDocument`.
 *
 * Records without a "ficha" entry are ignored: nothing is stored or indexed.
 * Despite the method name, no update is issued here; when a matching document
 * already exists the collection is left untouched and only the indexing call runs.
 *
 * @param items field name -> value pairs describing the record; the value under
 *              "ficha" is converted with `toString` and used as the record id
 */
def saveOrUpdate(items: Map[String, Object]): Unit = {
  // webdata is the database, regpub is the collection
  // NOTE(review): the lookup is kept exactly as written; `containsKey` presumably
  // resolves through a Java/Scala collection conversion imported elsewhere - confirm.
  val ficha = if (items.containsKey("ficha")) Some(items("ficha").toString) else None

  // foreach runs the body only when a ficha is present, so no Option.get is needed.
  ficha.foreach { fichaId =>
    val document = items.asDBObject

    // The whole document is used as the query, so a record counts as existing
    // only when a stored document matches every field.
    val existingItems = mongoCollection.find(document)
    if (existingItems.length == 0) {
      mongoCollection += document
      println(String.format("Ficha added: %s", fichaId))
    }

    // RM: ElasticSearch index (runs for new and already-stored records alike)
    this.indexRegPubDocument(document, fichaId)
  }
}
// RM: https://github.com/molekilla/ScalaRegPubRobot/blob/master/src/com/ecyware/web/robot/RegPubParser.scala
/**
 * Parses the HTML of a company record page into a `Store`.
 *
 * The page is expected to contain a `div` with id `CONSULTA` holding a series of
 * `table[border=0]` elements; fields are read from fixed table and cell positions.
 *
 * An empty store (`Store("", Map.empty)`) is returned when:
 *  - `url` ends with "&ID=0" (the page is not parsed at all), or
 *  - any exception is raised while parsing, e.g. a table or cell is missing.
 *
 * @param url  address the page was fetched from; stored in the resulting `Store`
 * @param html raw HTML of the page
 * @return a `Store` holding `url` and the extracted fields, or an empty store
 */
def parseCompany(url: String, html: String): Store = {
  if (url.endsWith("&ID=0")) {
    // This must be the value of the if/else: as a bare statement the empty store
    // was discarded and parsing went ahead regardless.
    Store("", Map.empty[String, String])
  } else {
    try {
      val document = Jsoup.parse(html)
      val page = document.select("div[id=CONSULTA]").select("table[border=0]").toSeq()

      // Ficha y Documento (record number and document number)
      // NOTE(review): toTableCellsSeq presumably yields the cells of one table;
      // it is defined outside this excerpt - confirm.
      val fichaDocumentoTable = page.get(0).toTableCellsSeq
      val nombre = page.get(2).toTableCellsSeq
      val tomoTable = page.get(3).toTableCellsSeq
      val fechaRegistroTable = page.get(4).toTableCellsSeq
      val escrituraTable = page.get(5).toTableCellsSeq
      val notariaTable = page.get(6).toTableCellsSeq
      val provinciaNotariaTable = page.get(7).toTableCellsSeq
      val domicilioTable = page.get(8).toTableCellsSeq
      val prendaTable = page.get(9).toTableCellsSeq
      val tasaUnicaTable = page.get(11).toTableCellsSeq
      val tasaUnicaAgenteResidente = page.get(12).toTableCellsSeq().get(1).text.trim
      val diarioTable = page.get(14).toTableCellsSeq()

      val idMap = Map(
        "ficha" -> fichaDocumentoTable.get(1).text.trim,
        "documento" -> fichaDocumentoTable.get(3).text.trim,
        "nombre" -> nombre.get(0).text.trim,
        "tomo" -> tomoTable.get(1).text.trim,
        "folio" -> tomoTable.get(3).text.trim,
        "asiento" -> tomoTable.get(5).text.trim,
        "fechaRegistro" -> fechaRegistroTable.get(1).text.trim,
        "estado" -> fechaRegistroTable.get(3).text.trim,
        "fechaEscritura" -> escrituraTable.get(3).text.trim,
        "escritura" -> escrituraTable.get(1).text.trim,
        "notaria" -> (notariaTable.get(1).text.trim + " " + notariaTable.get(2).text.trim),
        "provinciaNotaria" -> provinciaNotariaTable.get(1).text.trim,
        "duracion" -> domicilioTable.get(1).text.trim,
        "domicilio" -> domicilioTable.get(3).text.trim,
        // Unlike "notaria", the two cells are joined without a separator.
        "prenda" -> (prendaTable.get(1).text.trim() + prendaTable.get(2).text.trim),
        "tasaUnicaBoleta" -> tasaUnicaTable.get(1).text.trim,
        "tasaUnicaFechaPago" -> tasaUnicaTable.get(3).text.trim,
        "tasaUnicaAgenteResidente" -> tasaUnicaAgenteResidente,
        "diarioTomo" -> diarioTable.get(1).text.trim,
        "diarioAsiento" -> diarioTable.get(3).text.trim,
        "dignatarios" -> page.getDignatarios,
        "subscriptores" -> page.getSubscribers,
        "directores" -> page.getDirectors
      ) ++ page.getMicro

      Store(url, idMap)
    } catch {
      // Best effort: a page that does not match the expected layout gives an empty store.
      case _: Exception =>
        Store("", Map.empty[String, String])
    }
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment