Last active
June 4, 2018 12:37
-
-
Save Kyonru/5fda92bcaf0bff6eaf44ce84f1708d3a to your computer and use it in GitHub Desktop.
Extract the results of the lotteries in the Dominican Republic from http://leidsa.com/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Extract the body section of an HTML page.
 *
 * The result starts at the opening `<body` tag (the tag itself is included)
 * and stops right before the closing `</body>` tag.
 *
 * @param {string} html full html of the page
 * @returns {string} the body section; it starts at the beginning of `html`
 *   when no opening tag is found and runs to the end of `html` when no
 *   closing tag follows the opening one
 */
export const extractBody = (html) => {
  // Match `<body>` as well as `<body class="...">`, in any letter case.
  const openTag = /<body[\s>]/i.exec(html);
  const startIndex = openTag ? openTag.index : 0;
  // Only look for the closing tag after the opening one. A missing closing tag
  // must not feed -1 into substring(), which would swap the two arguments and
  // return the text that precedes the body instead of the body itself.
  const closeOffset = html.slice(startIndex).search(/<\/body\s*>/i);
  const endIndex = closeOffset === -1 ? html.length : startIndex + closeOffset;
  return html.substring(startIndex, endIndex);
};
/**
 * Pull the principal winning-numbers section out of the page body.
 *
 * The section is delimited by the `numeros-ganadores-pc` container and the
 * `</table>` marker, and is located with the shared `extract` helper.
 *
 * @param {string} body full body of the html
 * @returns {{chunk: (string|undefined), newBody: string}} `chunk` is the
 *   section reported by `extract` (undefined when the container is absent);
 *   `newBody` is the remaining body reported by `extract`
 */
export const extractPrincipal = (body) => {
  const { chunk, newString: newBody } = extract(
    body,
    '<div class="form-group numeros-ganadores-pc">',
    '</table>'
  );
  return { chunk, newBody };
};
/**
 * Extract all the lotteries found in the page body.
 *
 * On every pass the principal section (if any) is parsed first, then the next
 * `<div class="panel panel-default">` ... `</table>` section. Scanning stops
 * when no further panel marker is found.
 *
 * @param {string} body full html body
 * @param {array} chunkArr array the parsed lotteries are appended to
 * @returns {array} a copy of `chunkArr` without `undefined` entries
 */
export const extractChunks = (body, chunkArr = []) => {
  const startOfChunk = '<div class="panel panel-default">';
  const endOfChunk = '</table>';
  let remaining = body;
  let hasMore = true;

  while (hasMore) {
    const passInput = remaining;

    const principal = extractPrincipal(remaining);
    if (principal.chunk) {
      remaining = principal.newBody;
      chunkArr.push(parsePrincipalChunkToObject(principal.chunk, remaining));
    }

    const extraction = extract(remaining, startOfChunk, endOfChunk);
    if (extraction.startIndex === -1) {
      hasMore = false;
    } else {
      // `extract` may hand back a fragment that does not hold a panel at all;
      // only fragments that contain the panel marker are parsed.
      if (extraction.chunk.includes(startOfChunk)) {
        chunkArr.push(parseChunkToObject(extraction.chunk));
      }
      // A pass that leaves the text untouched would repeat forever (the
      // recursive form overflowed the stack here), so stop instead.
      if (extraction.newString === passInput) {
        hasMore = false;
      } else {
        remaining = extraction.newString;
      }
    }
  }

  return chunkArr.filter((item) => item !== undefined);
};
/**
 * Parse the principal numbers of the leidsa page into the common format.
 *
 * @param {string} chunk container of the principal numbers
 * @param {string} body full body of the html, searched for the results date
 * @returns {{logo: string, numbers: array, time: string, date: (string|undefined)}}
 *   `date` is undefined when the date paragraph or its
 *   `Resultados del ` text cannot be located
 */
export const parsePrincipalChunkToObject = (chunk, body) => {
  const numberStart = '<span class="numeros-ganadores';
  const numberEnd = '</span>';
  const numberExtra = ' numero-ganador-principal';
  const symbol = '">';
  const dateStart = '<p class="resultados-del-dia">';
  const dateEnd = '<div class="form-group numeros-ganadores-pc">';

  const rawDate = extract(body, dateStart, dateEnd);
  // `extract` reports no chunk when the date paragraph is missing; searching
  // inside `undefined` would throw, so the date is simply left undefined.
  const date = typeof rawDate.chunk === 'string'
    ? extract(rawDate.chunk, 'Resultados del ', '</p>').chunk
    : undefined;

  // Each raw number still carries the tail of the opening <span> tag
  // (optional extra class plus the closing `">`); strip both.
  const stripTagTail = (number) => number.replace(numberExtra, '').replace(symbol, '');

  return {
    logo: 'https://i.imgur.com/zm0wRDQ.png',
    numbers: extractNumbers(chunk, [], numberStart, numberEnd, stripTagTail),
    time: '',
    date
  };
};
/**
 * Parse the html of one lottery panel into an object.
 *
 * @param {string} chunk html of a single lottery panel
 * @returns {{logo: string, numbers: array, date: string, time: string}}
 */
export const parseChunkToObject = (chunk) => {
  // Parse the date/time header once instead of once per field.
  const { date, time } = extractDate(chunk);
  return {
    logo: extractImage(chunk),
    numbers: extractNumbers(chunk),
    date,
    time
  };
};
/**
 * Extract the numbers contained in a chunk.
 *
 * Scans `chunk` left to right for `numberStart ... numberEnd` pairs and
 * appends the text between each pair (passed through `filter`) to `numbers`.
 * Empty cells are skipped. A start marker with no end marker after it ends
 * the scan; the unterminated fragment is ignored.
 *
 * @param {string} chunk portion of the string which has the numbers
 * @param {array} numbers array the numbers are appended to (mutated)
 * @param {string} numberStart start limit
 * @param {string} numberEnd end limit
 * @param {func} filter transformation applied to every raw number string
 * @returns {array} the same `numbers` array that was passed in
 */
const extractNumbers = (
  chunk,
  numbers = [],
  numberStart = '<td class="numeros-ganadores-loterias">',
  numberEnd = '</td>',
  filter = (number) => number
) => {
  let startIndex = chunk.indexOf(numberStart);
  while (startIndex !== -1) {
    const valueStart = startIndex + numberStart.length;
    // Search for the end marker only after the start marker, so end markers
    // that belong to earlier, unrelated cells cannot be picked up.
    const endIndex = chunk.indexOf(numberEnd, valueStart);
    if (endIndex === -1) {
      break;
    }
    const number = chunk.substring(valueStart, endIndex);
    if (number.length > 0) {
      numbers.push(filter(number));
    }
    startIndex = chunk.indexOf(numberStart, endIndex + numberEnd.length);
  }
  return numbers;
};
/**
 * Extract the logo image url of a lottery panel.
 *
 * Looks for `<img class="logo-loteria" src="..." ` ending in `.png"/>` and
 * returns the `src` value.
 *
 * @param {string} chunk html of a single lottery panel
 * @returns {string} the logo url (ending in `.png`), or an empty string when
 *   the chunk holds no such image tag
 */
const extractImage = (chunk) => {
  const imageStart = '<img class="logo-loteria" src="';
  const imageEnd = '.png"/>';

  const tagIndex = chunk.indexOf(imageStart);
  if (tagIndex === -1) {
    return '';
  }
  const srcStart = tagIndex + imageStart.length;
  // Search for the end marker after the tag so that a `.png"/>` belonging to
  // an earlier image cannot be mistaken for the end of the logo url.
  const srcEnd = chunk.indexOf(imageEnd, srcStart);
  if (srcEnd === -1) {
    return '';
  }
  return `${chunk.substring(srcStart, srcEnd)}.png`;
};
/**
 * Extract the draw time and date of a lottery panel.
 *
 * The time is the text inside the first `<strong>...</strong>` pair with the
 * first `Sorteo` and the first `:` removed and the result trimmed. The date is
 * the text between that `</strong>` and the next `</p>`, returned as-is.
 *
 * @param {String} chunk html of a single lottery panel
 * @returns {{time: string, date: string}} either field is an empty string
 *   when its markers cannot be found
 */
const extractDate = (chunk) => {
  const timeStart = '<strong>';
  const dateStart = '</strong>';
  const dateEnd = '</p>';

  const openIndex = chunk.indexOf(timeStart);
  // The closing tag doubles as the start of the date; look for it after the
  // opening tag when there is one.
  const closeIndex = chunk.indexOf(
    dateStart,
    openIndex === -1 ? 0 : openIndex + timeStart.length
  );

  let time = '';
  if (openIndex !== -1 && closeIndex !== -1) {
    time = chunk
      .substring(openIndex + timeStart.length, closeIndex)
      .replace('Sorteo', '')
      .replace(':', '')
      .trim();
  }

  let date = '';
  if (closeIndex !== -1) {
    const dateFrom = closeIndex + dateStart.length;
    // Only a `</p>` after the date start can terminate the date.
    const dateTo = chunk.indexOf(dateEnd, dateFrom);
    if (dateTo !== -1) {
      date = chunk.substring(dateFrom, dateTo);
    }
  }

  return { time, date };
};
/**
 * Extract a portion of the string given the limits.
 *
 * The portion begins at the first occurrence of `start` (marker included) and
 * stops right before the first occurrence of `end` that follows it (marker
 * excluded). When no `end` follows, the portion runs to the end of the string,
 * so the start marker is always consumed and repeated calls on `newString`
 * make progress.
 *
 * @param {string} string complete body to be searched in
 * @param {string} start start of the searched string
 * @param {string} end final part of the searched string
 * @returns {{chunk: (string|undefined), newString: string, startIndex: number, endIndex: number}}
 *   `chunk` is undefined and `newString` is the untouched input when `start`
 *   is not found (or the input is not a string); `endIndex` is -1 when no
 *   `end` marker closes the chunk
 */
const extract = (string, start, end) => {
  if (typeof string !== 'string') {
    // Nothing to search in: report "not found" instead of throwing.
    return { chunk: undefined, newString: string, startIndex: -1, endIndex: -1 };
  }

  const startIndex = string.indexOf(start);
  if (startIndex === -1) {
    return {
      chunk: undefined,
      newString: string,
      startIndex,
      endIndex: string.indexOf(end)
    };
  }

  // Look for the end marker after the start marker. Searching from the
  // beginning of the string can find an earlier, unrelated end marker and
  // make substring() swap its arguments.
  const endIndex = string.indexOf(end, startIndex + start.length);
  const cutAt = endIndex === -1 ? string.length : endIndex;

  return {
    chunk: string.substring(startIndex, cutAt),
    // Remove exactly the span that was extracted.
    newString: string.slice(0, startIndex) + string.slice(cutAt),
    startIndex,
    endIndex
  };
};
/**
 * Convert the html text of the page into an array of lotteries.
 *
 * @param {string} html full html of the page
 * @returns {array} the lotteries produced by `extractChunks` for the page body
 */
export const htmlToJson = (html) => extractChunks(extractBody(html));
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment