Last active
August 29, 2015 14:16
-
-
Save krtek/a3d9d68081ec629a5a16 to your computer and use it in GitHub Desktop.
Jak vyhrát prostřeno?
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "settings": {
    "analysis": {
      "analyzer": {
        "cs_hunspell": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": ["stopwords_CZ", "lowercase", "hunspell_CZ", "asciifolding", "stopwords_CZ", "remove_duplicities"]
        }
      },
      "filter": {
        "stopwords_CZ": {
          "type": "stop",
          "stopwords": ["právě", "že", "_czech_"],
          "ignore_case": true
        },
        "hunspell_CZ": {
          "type": "hunspell",
          "locale": "cs_CZ",
          "dedup": true,
          "recursion_level": 0
        },
        "remove_duplicities": {
          "type": "unique",
          "only_on_same_position": true
        }
      }
    }
  },
  "mappings": {
    "menu": {
      "properties": {
        "hlavni": {
          "type": "string",
          "analyzer": "cs_hunspell"
        },
        "polevka": {
          "type": "string",
          "analyzer": "cs_hunspell"
        },
        "predkrm": {
          "type": "string",
          "analyzer": "cs_hunspell"
        },
        "zakusek": {
          "type": "string",
          "analyzer": "cs_hunspell"
        },
        "desc": {
          "type": "string",
          "analyzer": "cs_hunspell"
        }
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{
  "name": "Prostreno-web-scraper",
  "version": "0.0.1",
  "description": "Scrape Prostreno website and find winners.",
  "main": "server.coffee",
  "author": "Lukas Marek <lukas.marek@fragaria.cz>",
  "dependencies": {
    "cheerio": "latest",
    "coffee-script": "^1.7.1",
    "express": "latest",
    "request": "latest"
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Third-party modules.
express = require "express"   # web server
request = require "request"   # HTTP request library
cheerio = require "cheerio"   # jQuery-like HTML querying

app = express()

# Base URL that the relative links found on the scraped pages are appended to.
PRIMA_URL = "http://www.iprima.cz"

# Elasticsearch location that scraped menus are written to.
ES_CLUSTER_URL = "http://localhost:9200"
ES_INDEX = "prostreno"
ES_TYPE = "menu"

# Menu URLs that have already been handed to the menu loader, keyed by URL.
LOADED = {}

# URL of the oldest Prostřeno broadcast; the crawl starts here.
FIRST_URL = 'http://www.iprima.cz/prostreno/ucastnici?day=1267448067'

# Starts the crawl and answers right away; progress is only visible on the console.
app.get "/scrape-all", (req, res) ->
  _contestants FIRST_URL
  res.send 'Check your console!'

app.listen "8081"
console.log "Run http://localhost:8081/scrape-all"

exports = module.exports = app
###
Loads a contestants listing page, finds every contestant on it and starts
loading the menu of each one that passes _isValid.
Then finds the link to the next page and recursively processes that page too.

url - absolute URL of the contestants listing page to load.

A page that fails to download is logged and ends that branch of the crawl.
Contestants or pager entries without a link are skipped.
###
_contestants = (url) ->
  request url, (error, response, html) ->
    if error
      console.log("Failed to load contestants page: #{url}", error)
      return

    $ = cheerio.load(html)

    # Find all contestants on the page.
    $('.prostreno-contestant').each (index, contestant) ->
      # The 'tvwinner' class marks the winner.
      winner_flag = $(contestant).hasClass('tvwinner')
      suffix = $(contestant).find('a').attr('href')
      # Without a link there is no menu page to load; building a URL from
      # undefined would produce ".../undefined" and break the id extraction.
      return unless suffix
      menuUrl = "#{PRIMA_URL}#{suffix}"
      if _isValid(menuUrl)
        # Remember the URL so the same menu is not queued twice.
        LOADED[menuUrl] = true
        # The last path segment of the link is used as the document id.
        id = suffix.substring(suffix.lastIndexOf("/") + 1)
        _menu(menuUrl, id, winner_flag)
      # Explicit return: a `false` result would stop cheerio's iteration.
      return

    # Find the link to the next page and continue the crawl there.
    $('div.pager-next a').each (index, element) ->
      # A separate local name keeps the outer `url` parameter untouched.
      nextSuffix = $(element).attr('href')
      return unless nextSuffix
      _contestants("#{PRIMA_URL}#{nextSuffix}")
      return
###
Checks whether a link to a menu page is valid and is not a rerun of the show.

url - absolute menu page URL (may be null/undefined).

Returns true only when the URL has not been recorded in LOADED yet and
contains none of the excluded fragments; a null/undefined url yields false.
###
_isValid = (url) ->
  return false if LOADED[url]
  return false unless url?
  excludedFragments = ["repriza", "discussion", "-0", "-1"]
  excludedFragments.every (fragment) -> url.indexOf(fragment) == -1
###
Loads the menu of a single contestant, turns it into an object
and stores it in Elasticsearch via _save.

url    - absolute URL of the contestant's menu page.
id     - identifier stored on the document and used by _save in the target URL.
winner - boolean flag stored on the document as `winner`.

Pages that cannot be downloaded, or whose number of recipe titles is not
3, 4 or 5, are logged and skipped.
###
_menu = (url, id, winner) ->
  request url, (error, response, html) ->
    unless html
      # Nothing to parse; report it instead of dropping the page silently.
      console.log("Failed to load menu: #{url}", error)
      return

    $ = cheerio.load(html)

    # Collect the trimmed recipe titles in document order.
    titles = []
    $('div.prostreno-contestant-recipe h3.title').each (index, item) ->
      titles.push($(item).text().trim())
      return

    # Position of each course within `titles`, keyed by how many titles
    # were found. With five titles the first one is skipped
    # (presumably a non-course heading - TODO confirm against the site).
    layouts = {
      3: { predkrm: 0, hlavni: 1, zakusek: 2 }
      4: { predkrm: 0, polevka: 1, hlavni: 2, zakusek: 3 }
      5: { predkrm: 1, polevka: 2, hlavni: 3, zakusek: 4 }
    }
    layout = layouts[titles.length]
    unless layout
      console.log("Invalid menu: #{url}", titles)
      return

    menu = {}
    for course, position of layout
      menu[course] = titles[position]
    menu.id = id
    menu.winner = winner
    # Text content of the elements matching '.node-text'.
    menu.desc = $('.node-text').text()
    _save(menu)
###
Stores a menu in Elasticsearch.

menu - plain object sent as the JSON body; `menu.id` becomes the last
       segment of the target URL.

Sends a PUT to ES_CLUSTER_URL/ES_INDEX/ES_TYPE/<id>. Transport errors and
HTTP error responses (status >= 400) are logged; nothing is returned to the
caller and failures are not retried.
###
_save = (menu) ->
  options =
    url: "#{ES_CLUSTER_URL}/#{ES_INDEX}/#{ES_TYPE}/#{menu.id}"
    body: menu
    json: true
    method: 'put'

  request options, (err, httpResponse, body) ->
    if err
      console.log(err, body)
    else if httpResponse?.statusCode >= 400
      # `request` reports only transport failures through `err`; an error
      # status from Elasticsearch has to be checked explicitly.
      console.log("Elasticsearch rejected menu #{menu.id}: HTTP #{httpResponse.statusCode}", body)
    return
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment