Kyonru/leidsa_extractor.js

## leidsa_extractor.js
/**
 * Extract the body of the html page.
 *
 * The result starts at the opening `<body` tag (attributes allowed) and stops
 * right before `</body>`. When the opening tag is missing the page is read
 * from its first character; when the closing tag is missing the page is read
 * to its end.
 * @param {string} html full html of the page
 * @returns {string} the body markup, opening tag included, closing tag excluded
 */
export const extractBody = (html) => {
  // Accept `<body>` as well as `<body class="...">`.
  const openIndex = html.search(/<body[\s>]/);
  const from = openIndex > -1 ? openIndex : 0;

  // Look for the closing tag only after the opening one. Passing -1 (or an
  // index lower than the start) to substring() makes it swap its arguments
  // and return the text that precedes the body.
  const closeIndex = html.indexOf('</body>', from);

  return closeIndex > -1 ? html.substring(from, closeIndex) : html.substring(from);
};

/**
 * Pull the principal (lotomas) section out of the page body.
 *
 * The section is whatever `extract` returns for the markers
 * `<div class="form-group numeros-ganadores-pc">` and `</table>`.
 * @param {string} body full body of the html
 * @returns {{chunk: (string|undefined), newBody: string}} the section found
 *   and the body that `extract` produced after removing it
 */
export const extractPrincipal = (body) => {
  const { chunk, newString } = extract(
    body,
    '<div class="form-group numeros-ganadores-pc">',
    '</table>'
  );

  return { chunk, newBody: newString };
};

/**
 * Extract all the lotteries found in the page body.
 *
 * Each pass first tries to pull the principal section, then the next
 * `<div class="panel panel-default">` ... `</table>` span, and carries on with
 * the text that is left. The scan stops when no panel marker remains or when
 * a pass removes nothing (otherwise malformed markup would loop forever).
 * @param {string} body full html body
 * @param {array} chunkArr accumulator; parsed lotteries are appended to it
 * @returns {array} a new array holding the accumulated lotteries without
 *   `undefined` entries
 */
export const extractChunks = (body, chunkArr = []) => {
  const startOfChunk = '<div class="panel panel-default">';
  const endOfChunk = '</table>';
  let remaining = body;

  for (;;) {
    const principal = extractPrincipal(remaining);
    if (principal.chunk) {
      remaining = principal.newBody;
      // NOTE(review): the body handed over here no longer contains the
      // principal chunk, whose opening marker is the date end marker used by
      // parsePrincipalChunkToObject; confirm this is intended.
      chunkArr.push(parsePrincipalChunkToObject(principal.chunk, remaining));
    }

    const extraction = extract(remaining, startOfChunk, endOfChunk);
    if (extraction.startIndex < 0) {
      break;
    }

    // extract may hand back a span that does not contain the opening marker;
    // such spans are dropped instead of being parsed.
    if (extraction.chunk.includes(startOfChunk)) {
      chunkArr.push(parseChunkToObject(extraction.chunk));
    }

    // Nothing was removed, so another pass would see the very same text.
    if (extraction.newString === remaining) {
      break;
    }
    remaining = extraction.newString;
  }

  return chunkArr.filter((item) => item !== undefined);
};

/**
 * Parse the principal numbers of the leidsa page to the common format.
 *
 * The logo is a fixed URL and `time` is always an empty string. The date is
 * looked up in `body`, between `<p class="resultados-del-dia">` and the
 * principal section marker, and then narrowed to the `Resultados del ` ...
 * `</p>` span.
 * @param {string} chunk container of the principal numbers
 * @param {string} body full body of the html
 * @returns {{logo: string, numbers: array, time: string, date: (string|undefined)}}
 *   `date` is undefined when the date markers are not found
 */
export const parsePrincipalChunkToObject = (chunk, body) => {
  const numberStart = '<span class="numeros-ganadores';
  const numberEnd = '</span>';
  const numberExtra = ' numero-ganador-principal';
  const symbol = '">';

  const dateStart = '<p class="resultados-del-dia">';
  const dateEnd = '<div class="form-group numeros-ganadores-pc">';

  const rawDate = extract(body, dateStart, dateEnd);
  // extract leaves `chunk` undefined when the date paragraph is absent;
  // searching inside it would throw, so report a missing date instead.
  const date = rawDate.chunk === undefined
    ? undefined
    : extract(rawDate.chunk, 'Resultados del ', '</p>').chunk;

  // Strip what is left of the opening tag: the extra class name and `">`.
  const cleanNumber = (number) => number.replace(numberExtra, '').replace(symbol, '');

  return {
    logo: 'https://i.imgur.com/zm0wRDQ.png',
    numbers: extractNumbers(chunk, [], numberStart, numberEnd, cleanNumber),
    time: '',
    date
  };
};

/**
 * Parse the html of one lottery panel to an object.
 * @param {string} chunk html of the panel
 * @returns {{logo: string, numbers: array, date: string, time: string}}
 *   the values returned by extractImage, extractNumbers and extractDate
 */
export const parseChunkToObject = (chunk) => {
  const logo = extractImage(chunk);
  const numbers = extractNumbers(chunk);
  // extractDate returns both fields, so the chunk is scanned only once.
  const { date, time } = extractDate(chunk);

  return { logo, numbers, date, time };
};

/**
 * Extract the numbers of the chunk.
 *
 * Repeatedly asks `extract` for the next `numberStart` ... `numberEnd` span,
 * strips the start marker from it and appends the filtered remainder to
 * `numbers`. Spans that do not contain the start marker, or that are empty
 * once the marker is stripped, are skipped. The scan stops when the start
 * marker is gone or when a pass removes nothing from the text.
 * @param {string} chunk portion of the string which has the numbers
 * @param {array} numbers array the numbers are appended to
 * @param {string} numberStart start limit
 * @param {string} numberEnd end limit
 * @param {func} filter applied to every number before it is stored
 * @returns {array} the `numbers` array
 */
const extractNumbers = (chunk, numbers = [], numberStart = '<td class="numeros-ganadores-loterias">', numberEnd = '</td>', filter = (number) => number) => {
  let remaining = chunk;

  for (;;) {
    const extraction = extract(remaining, numberStart, numberEnd);
    if (extraction.startIndex < 0) {
      break;
    }

    if (extraction.chunk.includes(numberStart)) {
      const number = extraction.chunk.replace(numberStart, '');
      if (number.length > 0) {
        numbers.push(filter(number));
      }
    }

    // Nothing was removed, so another pass would see the very same text.
    if (extraction.newString === remaining) {
      break;
    }
    remaining = extraction.newString;
  }

  return numbers;
};

/**
 * Extract the logo image url.
 *
 * The opening marker is matched literally, including the double space after
 * `<img`.
 * @param {string} chunk html of one lottery panel
 * @returns {string} the url ending in `.png`, or an empty string when the
 *   logo marker is not found
 */
const extractImage = (chunk) => {
  const imageStart = '<img  class="logo-loteria" src="';
  const imageEnd = '.png"/>';
  const { chunk: image } = extract(chunk, imageStart, imageEnd);

  // No logo tag in this panel: calling replace() on undefined would throw.
  if (image === undefined) {
    return '';
  }

  // The end marker is not part of the extracted span, so `.png` is put back.
  return `${image.replace(imageStart, '')}.png`;
};

/**
 * Extract the date and the time of a draw.
 *
 * The time is the text between `<strong>` and `</strong>` with the first
 * `Sorteo` and the first `:` removed, trimmed. The date is the text between
 * `</strong>` and `</p>`, untrimmed.
 * @param {String} chunk html of one lottery panel
 * @returns {{time: string, date: string}} each field is an empty string when
 *   its markers are not found
 */
const extractDate = (chunk) => {
  const timeStart = '<strong>';
  const dateStart = '</strong>';
  const dateEnd = '</p>';

  const rawDate = extract(chunk, dateStart, dateEnd).chunk;
  const rawTime = extract(chunk, timeStart, dateStart).chunk;

  // extract leaves `chunk` undefined when the start marker is absent.
  const date = rawDate === undefined ? '' : rawDate.replace(dateStart, '');
  const time = rawTime === undefined
    ? ''
    : rawTime.replace(timeStart, '').replace('Sorteo', '').replace(':', '').trim();

  return { time, date };
};


/**
 * Extract a portion of the string given the limits.
 *
 * The portion starts at the first occurrence of `start` (marker included) and
 * stops right before the first occurrence of `end` that follows it. When
 * `end` does not follow, the portion runs to the end of the string.
 * @param {string} string complete body to be searched in
 * @param {string} start start of the searched string
 * @param {string} end final part of the searched string
 * @returns {{chunk: (string|undefined), newString: string, startIndex: number, endIndex: number}}
 *   `chunk` is undefined and `newString` is the untouched input when `start`
 *   is not found; otherwise `newString` is the input without `chunk`.
 *   `endIndex` is -1 when `end` was not found.
 */
const extract = (string, start, end) => {
  // Callers may pass on the undefined chunk of a previous failed extraction.
  if (typeof string !== 'string') {
    return { chunk: undefined, newString: string, startIndex: -1, endIndex: -1 };
  }

  const startIndex = string.indexOf(start);
  if (startIndex === -1) {
    return {
      chunk: undefined,
      newString: string,
      startIndex,
      endIndex: string.indexOf(end)
    };
  }

  // Search for the end marker only after the start marker. An end index that
  // is -1 or lower than the start makes substring() swap or clamp its
  // arguments and return text that lies outside the requested span.
  const endIndex = string.indexOf(end, startIndex + start.length);
  const stop = endIndex > -1 ? endIndex : string.length;

  return {
    chunk: string.substring(startIndex, stop),
    newString: `${string.slice(0, startIndex)}${string.slice(stop)}`,
    startIndex,
    endIndex
  };
};

/**
 * Convert the html text of the page into an array of lotteries.
 * @param {string} html full html of the page
 * @returns {array} the result of extractChunks applied to the page body
 */
export const htmlToJson = (html) => {
  const body = extractBody(html);
  return extractChunks(body);
};
	/**
	 * Extract the body of the html page.
	 *
	 * The result starts at the opening `<body` tag (attributes allowed) and stops
	 * right before `</body>`. When the opening tag is missing the page is read
	 * from its first character; when the closing tag is missing the page is read
	 * to its end.
	 * @param {string} html full html of the page
	 * @returns {string} the body markup, opening tag included, closing tag excluded
	 */
	export const extractBody = (html) => {
		// Accept `<body>` as well as `<body class="...">`.
		const openIndex = html.search(/<body[\s>]/);
		const from = openIndex > -1 ? openIndex : 0;

		// Look for the closing tag only after the opening one. Passing -1 (or an
		// index lower than the start) to substring() makes it swap its arguments
		// and return the text that precedes the body.
		const closeIndex = html.indexOf('</body>', from);

		return closeIndex > -1 ? html.substring(from, closeIndex) : html.substring(from);
	};

	/**
	 * Pull the principal (lotomas) section out of the page body.
	 *
	 * The section is whatever `extract` returns for the markers
	 * `<div class="form-group numeros-ganadores-pc">` and `</table>`.
	 * @param {string} body full body of the html
	 * @returns {{chunk: (string|undefined), newBody: string}} the section found
	 *   and the body that `extract` produced after removing it
	 */
	export const extractPrincipal = (body) => {
		const { chunk, newString } = extract(
			body,
			'<div class="form-group numeros-ganadores-pc">',
			'</table>'
		);

		return { chunk, newBody: newString };
	};

	/**
	 * Extract all the lotteries found in the page body.
	 *
	 * Each pass first tries to pull the principal section, then the next
	 * `<div class="panel panel-default">` ... `</table>` span, and carries on with
	 * the text that is left. The scan stops when no panel marker remains or when
	 * a pass removes nothing (otherwise malformed markup would loop forever).
	 * @param {string} body full html body
	 * @param {array} chunkArr accumulator; parsed lotteries are appended to it
	 * @returns {array} a new array holding the accumulated lotteries without
	 *   `undefined` entries
	 */
	export const extractChunks = (body, chunkArr = []) => {
		const startOfChunk = '<div class="panel panel-default">';
		const endOfChunk = '</table>';
		let remaining = body;

		for (;;) {
			const principal = extractPrincipal(remaining);
			if (principal.chunk) {
				remaining = principal.newBody;
				// NOTE(review): the body handed over here no longer contains the
				// principal chunk, whose opening marker is the date end marker used by
				// parsePrincipalChunkToObject; confirm this is intended.
				chunkArr.push(parsePrincipalChunkToObject(principal.chunk, remaining));
			}

			const extraction = extract(remaining, startOfChunk, endOfChunk);
			if (extraction.startIndex < 0) {
				break;
			}

			// extract may hand back a span that does not contain the opening marker;
			// such spans are dropped instead of being parsed.
			if (extraction.chunk.includes(startOfChunk)) {
				chunkArr.push(parseChunkToObject(extraction.chunk));
			}

			// Nothing was removed, so another pass would see the very same text.
			if (extraction.newString === remaining) {
				break;
			}
			remaining = extraction.newString;
		}

		return chunkArr.filter((item) => item !== undefined);
	};

	/**
	 * Parse the principal numbers of the leidsa page to the common format.
	 *
	 * The logo is a fixed URL and `time` is always an empty string. The date is
	 * looked up in `body`, between `<p class="resultados-del-dia">` and the
	 * principal section marker, and then narrowed to the `Resultados del ` ...
	 * `</p>` span.
	 * @param {string} chunk container of the principal numbers
	 * @param {string} body full body of the html
	 * @returns {{logo: string, numbers: array, time: string, date: (string|undefined)}}
	 *   `date` is undefined when the date markers are not found
	 */
	export const parsePrincipalChunkToObject = (chunk, body) => {
		const numberStart = '<span class="numeros-ganadores';
		const numberEnd = '</span>';
		const numberExtra = ' numero-ganador-principal';
		const symbol = '">';

		const dateStart = '<p class="resultados-del-dia">';
		const dateEnd = '<div class="form-group numeros-ganadores-pc">';

		const rawDate = extract(body, dateStart, dateEnd);
		// extract leaves `chunk` undefined when the date paragraph is absent;
		// searching inside it would throw, so report a missing date instead.
		const date = rawDate.chunk === undefined
			? undefined
			: extract(rawDate.chunk, 'Resultados del ', '</p>').chunk;

		// Strip what is left of the opening tag: the extra class name and `">`.
		const cleanNumber = (number) => number.replace(numberExtra, '').replace(symbol, '');

		return {
			logo: 'https://i.imgur.com/zm0wRDQ.png',
			numbers: extractNumbers(chunk, [], numberStart, numberEnd, cleanNumber),
			time: '',
			date
		};
	};

	/**
	 * Parse the html of one lottery panel to an object.
	 * @param {string} chunk html of the panel
	 * @returns {{logo: string, numbers: array, date: string, time: string}}
	 *   the values returned by extractImage, extractNumbers and extractDate
	 */
	export const parseChunkToObject = (chunk) => {
		const logo = extractImage(chunk);
		const numbers = extractNumbers(chunk);
		// extractDate returns both fields, so the chunk is scanned only once.
		const { date, time } = extractDate(chunk);

		return { logo, numbers, date, time };
	};

	/**
	 * Extract the numbers of the chunk.
	 *
	 * Repeatedly asks `extract` for the next `numberStart` ... `numberEnd` span,
	 * strips the start marker from it and appends the filtered remainder to
	 * `numbers`. Spans that do not contain the start marker, or that are empty
	 * once the marker is stripped, are skipped. The scan stops when the start
	 * marker is gone or when a pass removes nothing from the text.
	 * @param {string} chunk portion of the string which has the numbers
	 * @param {array} numbers array the numbers are appended to
	 * @param {string} numberStart start limit
	 * @param {string} numberEnd end limit
	 * @param {func} filter applied to every number before it is stored
	 * @returns {array} the `numbers` array
	 */
	const extractNumbers = (chunk, numbers = [], numberStart = '<td class="numeros-ganadores-loterias">', numberEnd = '</td>', filter = (number) => number) => {
		let remaining = chunk;

		for (;;) {
			const extraction = extract(remaining, numberStart, numberEnd);
			if (extraction.startIndex < 0) {
				break;
			}

			if (extraction.chunk.includes(numberStart)) {
				const number = extraction.chunk.replace(numberStart, '');
				if (number.length > 0) {
					numbers.push(filter(number));
				}
			}

			// Nothing was removed, so another pass would see the very same text.
			if (extraction.newString === remaining) {
				break;
			}
			remaining = extraction.newString;
		}

		return numbers;
	};

	/**
	 * Extract the logo image url.
	 *
	 * The opening marker is matched literally against the chunk.
	 * @param {string} chunk html of one lottery panel
	 * @returns {string} the url ending in `.png`, or an empty string when the
	 *   logo marker is not found
	 */
	const extractImage = (chunk) => {
		const imageStart = '<img class="logo-loteria" src="';
		const imageEnd = '.png"/>';
		const { chunk: image } = extract(chunk, imageStart, imageEnd);

		// No logo tag in this panel: calling replace() on undefined would throw.
		if (image === undefined) {
			return '';
		}

		// The end marker is not part of the extracted span, so `.png` is put back.
		return `${image.replace(imageStart, '')}.png`;
	};

	/**
	 * Extract the date and the time of a draw.
	 *
	 * The time is the text between `<strong>` and `</strong>` with the first
	 * `Sorteo` and the first `:` removed, trimmed. The date is the text between
	 * `</strong>` and `</p>`, untrimmed.
	 * @param {String} chunk html of one lottery panel
	 * @returns {{time: string, date: string}} each field is an empty string when
	 *   its markers are not found
	 */
	const extractDate = (chunk) => {
		const timeStart = '<strong>';
		const dateStart = '</strong>';
		const dateEnd = '</p>';

		const rawDate = extract(chunk, dateStart, dateEnd).chunk;
		const rawTime = extract(chunk, timeStart, dateStart).chunk;

		// extract leaves `chunk` undefined when the start marker is absent.
		const date = rawDate === undefined ? '' : rawDate.replace(dateStart, '');
		const time = rawTime === undefined
			? ''
			: rawTime.replace(timeStart, '').replace('Sorteo', '').replace(':', '').trim();

		return { time, date };
	};


	/**
	 * Extract a portion of the string given the limits.
	 *
	 * The portion starts at the first occurrence of `start` (marker included) and
	 * stops right before the first occurrence of `end` that follows it. When
	 * `end` does not follow, the portion runs to the end of the string.
	 * @param {string} string complete body to be searched in
	 * @param {string} start start of the searched string
	 * @param {string} end final part of the searched string
	 * @returns {{chunk: (string|undefined), newString: string, startIndex: number, endIndex: number}}
	 *   `chunk` is undefined and `newString` is the untouched input when `start`
	 *   is not found; otherwise `newString` is the input without `chunk`.
	 *   `endIndex` is -1 when `end` was not found.
	 */
	const extract = (string, start, end) => {
		// Callers may pass on the undefined chunk of a previous failed extraction.
		if (typeof string !== 'string') {
			return { chunk: undefined, newString: string, startIndex: -1, endIndex: -1 };
		}

		const startIndex = string.indexOf(start);
		if (startIndex === -1) {
			return {
				chunk: undefined,
				newString: string,
				startIndex,
				endIndex: string.indexOf(end)
			};
		}

		// Search for the end marker only after the start marker. An end index that
		// is -1 or lower than the start makes substring() swap or clamp its
		// arguments and return text that lies outside the requested span.
		const endIndex = string.indexOf(end, startIndex + start.length);
		const stop = endIndex > -1 ? endIndex : string.length;

		return {
			chunk: string.substring(startIndex, stop),
			newString: `${string.slice(0, startIndex)}${string.slice(stop)}`,
			startIndex,
			endIndex
		};
	};

	/**
	 * Convert the html text of the page into an array of lotteries.
	 * @param {string} html full html of the page
	 * @returns {array} the result of extractChunks applied to the page body
	 */
	export const htmlToJson = (html) => {
		const body = extractBody(html);
		return extractChunks(body);
	};