Skip to content

Instantly share code, notes, and snippets.

@krtek
Last active August 29, 2015 14:16
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save krtek/a3d9d68081ec629a5a16 to your computer and use it in GitHub Desktop.
Save krtek/a3d9d68081ec629a5a16 to your computer and use it in GitHub Desktop.
Jak vyhrát prostřeno?
{
"settings": {
"analysis": {
"analyzer": {
"cs_hunspell": {
"type": "custom",
"tokenizer": "standard",
"filter": ["stopwords_CZ", "lowercase", "hunspell_CZ", "asciifolding", "stopwords_CZ", "remove_duplicities"]
}
},
"filter": {
"stopwords_CZ": {
"type": "stop",
"stopwords": ["právě", "že", "_czech_"],
"ignore_case": true
},
"hunspell_CZ": {
"type": "hunspell",
"locale": "cs_CZ",
"dedup": true,
"recursion_level": 0
},
"remove_duplicities": {
"type": "unique",
"only_on_same_position": true
}
}
}
},
"mappings": {
"menu": {
"properties": {
"hlavni": {
"type": "string",
"analyzer": "cs_hunspell"
},
"polevka": {
"type": "string",
"analyzer": "cs_hunspell"
},
"predkrm": {
"type": "string",
"analyzer": "cs_hunspell"
},
"zakusek": {
"type": "string",
"analyzer": "cs_hunspell"
},
"desc": {
"type": "string",
"analyzer": "cs_hunspell"
}
}
}
}
}
{
"name": "Prostreno-web-scraper",
"version": "0.0.1",
"description": "Scrape Prostreno website and find winners.",
"main": "server.coffee",
"author": "Lukas Marek <lukas.marek@fragaria.cz>",
"dependencies": {
"cheerio": "latest",
"coffee-script": "^1.7.1",
"express": "latest",
"request": "latest"
}
}
# Third-party modules
express = require("express") # web server
request = require("request") # HTTP request library
cheerio = require("cheerio") # jQuery-like HTML traversal

# Site being scraped and the Elasticsearch destination
PRIMA_URL = "http://www.iprima.cz"
ES_CLUSTER_URL = "http://localhost:9200"
ES_INDEX = "prostreno"
ES_TYPE = "menu"

# URL of the oldest Prostřeno broadcast; the crawl starts here
FIRST_URL = 'http://www.iprima.cz/prostreno/ucastnici?day=1267448067'

# Menu URLs that have already been handed over for loading, keyed by URL
LOADED = {}

app = express()

# Hitting this route starts the crawl; progress is only visible on the console.
app.get("/scrape-all", (req, res) ->
  _contestants(FIRST_URL)
  res.send('Check your console!')
)

app.listen("8081")
console.log("Run http://localhost:8081/scrape-all")

exports = module.exports = app
###
Loads a page listing contestants, finds every contestant on it and starts
loading the menu of each one. Afterwards it looks for the link to the next
page and recursively processes that page as well.

url - absolute URL of a contestants listing page.

All work happens asynchronously inside the request callback. Request
failures are logged to the console and end the crawl along that path.
###
_contestants = (url) ->
  request url, (error, response, html) ->
    if error
      # Report the failure instead of silently stopping the crawl here.
      console.log("Failed to load contestants: #{url}", error)
      return
    $ = cheerio.load(html)
    # Find all contestants
    $('.prostreno-contestant').each (index, contestant) ->
      # Check whether this contestant is marked as the winner
      winner_flag = $(contestant).hasClass('tvwinner')
      suffix = $(contestant).find('a').attr('href')
      # Without a link there is no menu page to load; skipping here also
      # prevents calling substring on an undefined href below.
      return unless suffix
      menuUrl = "#{PRIMA_URL}#{suffix}"
      if _isValid(menuUrl)
        LOADED[menuUrl] = true
        # The last path segment of the link is used as the document id
        id = suffix.substring(suffix.lastIndexOf("/") + 1)
        # Load the menu
        _menu(menuUrl, id, winner_flag)
      # Explicit bare return: a false return value would stop the each loop.
      return
    # Find link to next page
    $('div.pager-next a').each (index, element) ->
      nextSuffix = $(element).attr('href')
      # Ignore pager anchors that carry no href rather than requesting a
      # URL that ends in "undefined".
      return unless nextSuffix
      _contestants("#{PRIMA_URL}#{nextSuffix}")
      return
###
Decides whether a menu page link should be loaded.

Returns true only when the URL has not been recorded in LOADED yet and
contains none of the fragments "repriza" (rerun), "discussion", "-0" or "-1".
A null or undefined url yields false.
NOTE(review): "-0" and "-1" presumably mark duplicated pages; confirm
against the site's URL scheme.
###
_isValid = (url) ->
  # Already handed over for loading
  return false if LOADED[url]
  # Nothing to inspect
  return false unless url?
  for fragment in ["repriza", "discussion", "-0", "-1"]
    return false unless url.indexOf(fragment) == -1
  true
###
Loads the menu page of a single contestant, turns it into a plain object
and stores it in Elasticsearch via _save.

url    - absolute URL of the contestant's menu page.
id     - identifier stored on the menu object as its id.
winner - boolean stored on the menu object as its winner flag.

Pages with 3, 4 or 5 recipe headings are accepted; any other count is logged
as an invalid menu and skipped. A request that yields no HTML is logged and
skipped as well.
###
_menu = (url, id, winner) ->
  request url, (error, response, html) ->
    unless html
      # Make failed downloads visible; otherwise menus go missing unnoticed.
      console.log("Failed to load menu: #{url}", error)
      return
    $ = cheerio.load(html)
    # Collect the trimmed recipe headings in page order
    titles = []
    $('div.prostreno-contestant-recipe h3.title').each (index, item) ->
      titles.push($(item).text().trim())
      return
    menu = switch titles.length
      when 3
        predkrm: titles[0]
        hlavni: titles[1]
        zakusek: titles[2]
      when 4
        predkrm: titles[0]
        polevka: titles[1]
        hlavni: titles[2]
        zakusek: titles[3]
      when 5
        # The first heading is deliberately left out here.
        # NOTE(review): presumably an extra non-course heading; confirm
        # against the page layout.
        predkrm: titles[1]
        polevka: titles[2]
        hlavni: titles[3]
        zakusek: titles[4]
      else
        null
    unless menu
      console.log("Invalid menu: #{url}", titles)
      return
    menu.id = id
    menu.winner = winner
    menu.desc = $('.node-text').text()
    _save(menu)
###
Stores one menu in Elasticsearch.

menu - object to store; it is sent as the JSON body of a PUT request and
       its id property forms the last segment of the document URL.

Only transport-level errors are reported, by logging them to the console.
###
_save = (menu) ->
  typeUrl = "#{ES_CLUSTER_URL}/#{ES_INDEX}/#{ES_TYPE}"
  documentUrl = "#{typeUrl}/#{menu.id}"
  onSaved = (saveError, httpResponse, responseBody) ->
    console.log(saveError, responseBody) if saveError
  request {url: documentUrl, method: 'put', json: true, body: menu}, onSaved
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment