Skip to content

Instantly share code, notes, and snippets.

@toraritte
Last active December 21, 2022 00:06
Show Gist options
  • Save toraritte/a47ad823dd1b0769aac1bdf06191e84f to your computer and use it in GitHub Desktop.
Save toraritte/a47ad823dd1b0769aac1bdf06191e84f to your computer and use it in GitHub Desktop.
Scrape the Target weekly ads into (hopefully) coherent, human-readable text (see the comments on how to test-drive it)
/* ============================================================================= */
/**
 * Replaces a missing value with an empty string.
 *
 * Despite the name, every falsy input (`null`, `undefined`, `''`, `0`, `NaN`,
 * `false`) yields `''`; any truthy input is handed back unchanged.
 *
 * @param {*} maybeNull - Value that may be absent.
 * @returns {*} `maybeNull` itself when it is truthy, otherwise `''`.
 */
function nullToEmptyString(maybeNull) {
  if (maybeNull) {
    return maybeNull;
  }
  return '';
}
/**
 * Removes the final period of a text, together with any whitespace,
 * asterisks or daggers that trail it up to the end of the string.
 *
 * A falsy `itemProperty` is first turned into `''` by `nullToEmptyString`,
 * so a missing property produces an empty string instead of an exception.
 *
 * @param {?string} itemProperty - Raw text of an ad item property.
 * @returns {string} The text without its closing period and trailing marks.
 */
function stripPeriod(itemProperty) {
  /* A period followed only by whitespace, `*` or `†` until the end of input. */
  const trailingPeriodPattern = /\.(\s|\*|†)*$/;
  return nullToEmptyString(itemProperty).replace(trailingPeriodPattern, '');
}
/**
 * Prefixes a price with `$` when the price is purely numeric.
 *
 * Non-numeric prices (for example `2/$5`) are returned as they are, and a
 * falsy price becomes `''`.
 *
 * The numeric check is explicit rather than relying on the truthiness of
 * `Number(price)`: a truthiness test treats "0" and "0.00" as "not a number"
 * and would leave them without a dollar sign.
 *
 * @param {?(string|number)} itemProperty - Raw `price` value of an ad item.
 * @returns {string} The price as text, dollar-prefixed when numeric.
 */
function addDollarSignIfPriceIsNumber(itemProperty) {
  /* Same falsy-to-'' normalisation that `nullToEmptyString` performs. */
  const priceText = itemProperty ? itemProperty : '';
  /* `Number('')` and `Number('   ')` are 0, so blank text is ruled out first. */
  const isNumeric =
    String(priceText).trim() !== '' && Number.isFinite(Number(priceText));
  return isNumeric ? `$${priceText}` : `${priceText}`;
}
/**
 * Condenses an item's fine print into a short clause for the spoken script.
 *
 * The text is split into fragments at sentence ends (`. `), asterisks and
 * daggers; fragments that are legal boilerplate, very short, or that mention
 * Target Circle or help.target.com are dropped, and the rest are joined
 * with `; `.
 *
 * @param {?string} finePrint - Raw `fine_print` value of an ad item.
 * @returns {string} `'; Fine print: …'`, or `''` when nothing remains, when
 *   the input is missing, or when the result is longer than 1000 characters.
 */
function formatFinePrint(finePrint) {
  /* A missing value would make the `.replace` chain below throw. */
  if (!finePrint) {
    return '';
  }

  const legalBoilerplate = /(©|reserved|trademark|property|countr)/i;

  const newFinePrint = String(finePrint)
    /* Per the original author's note: this is meant to help filter out
       'Expect More. Pay Less.' on the "company messaging" page. */
    .replace(/(More|Less)\./g, '')
    /* The dot is escaped (and the word anchored) so that only the
       abbreviation "No." is expanded, not "Not " or "Now ". */
    .replace(/\bNo\. /g, 'Product number: ')
    .replace(/\/mo\./g, ' per month')
    /* Non-capturing split: the separators themselves must not end up in
       the list of fragments. */
    .split(/\.\s+|\*|†/)
    .filter((fragment) => !legalBoilerplate.test(fragment))
    .filter((fragment) => fragment.length > 5)
    .filter((fragment) => !fragment.includes('Target Circle'))
    .filter((fragment) => !fragment.includes('help.target.com'))
    .join('; ');

  /* Nothing left to read, or too long to be worth reading aloud. */
  if (newFinePrint.length === 0 || newFinePrint.length > 1000) {
    return '';
  }
  return `; Fine print: ${newFinePrint}`;
}
/**
 * Builds the sentence that is read aloud for a single ad item.
 *
 * The draft has the shape `title; price; description[; Fine print: …]. ` and
 * is then passed through an ordered list of textual rewrites. The order of
 * the rules matters: later rules work on the output of earlier ones.
 *
 * @param {Object} item - A hotspot of the flyer; `title`, `price`,
 *   `product_description` and `fine_print` are read from it.
 * @returns {string} The text to be spoken for this item.
 */
function makeItemScript(item) {
  /* [pattern, replacement] pairs, applied strictly from top to bottom. */
  const rewriteRules = [
    [/BOGO/g, 'Buy one get one '],
    [/\s*[†•]\s*/g, '; '],
    /* An empty product_description leaves two semicolons in a row. */
    [/;\s*;/g, ';'],
    /* An empty price leaves a dangling semicolon before the period. */
    [/;\s*\./g, '.'],
    /* Whitespace and punctuation clean-up. */
    [/\s+/g, ' '],
    [/\s+\./g, '.'],
    [/\.\./g, '.'],
    [/\*/g, ''],
    [/\.;/g, ';'],
    /* Disabled in the original: [/;\s*Fine print:\s*\./, '.'] */
    /* Unit abbreviations spelled out. In the replacement strings a `$`
       followed by a space is inserted literally. */
    [/-pc\./g, '-piece'],
    [/-pk\./g, '-pack'],
    [/-qt\./g, '-quart'],
    [/-ct\./g, '-count'],
    [/-oz\./g, '-ounce'],
    [/\/lb/g, '$ per pound'],
    [/\/mo/g, '$ per month'],
    [/-in\./g, '-inch'],
    [/-pt\./g, '-pint'],
    [/-fl\./g, '-fluid'],
    [/-lb\./g, '-pound'],
    /* Multi-buy prices: "2/$5" becomes "2 for $5". */
    [/([0-9]+)\/\$/g, "$1 for $"],
    /* The Azure TTS REST API chokes on the replaced characters below. */
    [/\&/g, ' and '],
    [/™/g, ' '],
  ];

  const titlePart = stripPeriod(item.title);
  const pricePart = addDollarSignIfPriceIsNumber(item.price);
  const descriptionPart = stripPeriod(item.product_description);
  const finePrintPart = formatFinePrint(item.fine_print);
  const draft = `${titlePart}; ${pricePart}; ${descriptionPart}${finePrintPart}. `;

  return rewriteRules.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    draft
  );
}
/**
 * Formats an item count with the matching noun.
 *
 * Only a count of exactly one takes the singular; zero reads "0 items".
 *
 * @param {number} itemNumber - Number of items.
 * @returns {string} For example `'1 item'`, `'0 items'` or `'12 items'`.
 */
function pluralize(itemNumber) {
  const noun = (itemNumber === 1) ? 'item' : 'items';
  return `${itemNumber} ${noun}`;
}
/**
 * Turns a page's comma-separated description into a `; `-separated list of
 * unique entries suitable for reading aloud.
 *
 * The codes `BEV` and `SNC` are spelled out and `NIT` is dropped (its meaning
 * is not documented in this file). The codes are matched as whole words only,
 * so that ordinary words containing them (e.g. "FURNITURE") stay intact.
 * Entries are trimmed before duplicates are removed, and empty entries are
 * discarded.
 *
 * @param {?string} desc - Raw `page_description` of a flyer page.
 * @returns {string} The cleaned-up description; `''` when `desc` is missing.
 */
function massagePageDescription(desc) {
  /* A missing description would make the `.replace` chain below throw. */
  if (!desc) {
    return '';
  }

  const entries = String(desc)
    .replace(/\bBEV\b/g, 'beverages')
    .replace(/\bSNC\b/g, 'novelty candies')
    .replace(/\bNIT\b/g, '')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);

  /* A Set keeps the first occurrence of each entry, in insertion order. */
  return [...new Set(entries)].join('; ');
}
/**
 * Converts the weekly-ad JSON into an array of pages with ready-to-read text.
 *
 * `promotion_message` is deliberately left out because, per the original
 * author's note, it is always included in `product_description`.
 *
 * @param {string} jsonString - JSON text with a top-level `pages` array.
 * @returns {Array<Object>} One object per retained page, in flyer order:
 *   `items` (each with `tts_item_script` and `original_properties`),
 *   `tts_page_script`, and the page's own `original_properties`.
 * @throws {SyntaxError} When `jsonString` is not valid JSON.
 */
function parseFlyer(jsonString) {
  return JSON.parse(jsonString).pages
    /* Filter pages that advertise Target services: only pages whose
       `indd_page_number` converts to a non-zero number are kept. */
    .filter((page) => Number(page.indd_page_number))
    .map((page) => {
      /* A page without `hotspots` is treated as a page without items. */
      const newItems = (page.hotspots || [])
        /* When fewer than two of `title`, `price` and `product_description`
           are present, the item is usually just a page blurb. */
        .filter((item) =>
          [!!item.title, !!item.price, !!item.product_description]
            .filter((isPresent) => isPresent).length > 1
        )
        .map((item) => ({
          tts_item_script: makeItemScript(item),
          original_properties: {
            title: item.title,
            price: item.price,
            product_description: item.product_description,
            fine_print: item.fine_print,
            /* promotion_message: item.promotion_message */
          }
        }));

      return {
        items: newItems,
        /* Announce the number of items that are actually read, not the raw
           hotspot count, which also includes the filtered-out blurbs. */
        tts_page_script: `${massagePageDescription(page.page_description)}; ${pluralize(newItems.length)} on this page. `,
        original_properties: {
          page_description: page.page_description,
          indd_page_number: page.indd_page_number
        }
      };
    });
}
/**
 * Concatenates the spoken scripts of a page's items, in order.
 *
 * @param {Array<Object>} items - Items carrying a `tts_item_script` property.
 * @returns {string} All item scripts glued together; `""` for no items.
 */
const convertPageItemsToString = (items) => {
  let combinedScript = "";
  items.forEach((item) => {
    combinedScript = combinedScript + item.tts_item_script;
  });
  return combinedScript;
};
/* Browser-console entry point: the page body is expected to hold the flyer
   JSON. The expression evaluates to one text per page, numbered from 1. */
parseFlyer(document.body.textContent).map(
  (page, pageIndex) => {
    const pageNumber = pageIndex + 1;
    const itemsText = convertPageItemsToString(page.items);
    return `Page ${ pageNumber }: ${ page.tts_page_script } ${ itemsText }`;
  }
);
/*
The in-progress port of the above script to Node.js
*/
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
import * as readline from "readline";
import * as https from "https";
/* import * as http from "http"; */
import * as fs from "fs";
/* ============================================================================= */
/**
 * Replaces a missing value with an empty string.
 *
 * Despite the name, every falsy input (`null`, `undefined`, `''`, `0`, `NaN`,
 * `false`) yields `''`; any truthy input is handed back unchanged.
 *
 * @param {*} maybeNull - Value that may be absent.
 * @returns {*} `maybeNull` itself when it is truthy, otherwise `''`.
 */
function nullToEmptyString(maybeNull) {
  if (maybeNull) {
    return maybeNull;
  }
  return '';
}
/**
 * Removes the final period of a text, together with any whitespace,
 * asterisks or daggers that trail it up to the end of the string.
 *
 * A falsy `itemProperty` is first turned into `''` by `nullToEmptyString`,
 * so a missing property produces an empty string instead of an exception.
 *
 * @param {?string} itemProperty - Raw text of an ad item property.
 * @returns {string} The text without its closing period and trailing marks.
 */
function stripPeriod(itemProperty) {
  /* A period followed only by whitespace, `*` or `†` until the end of input. */
  const trailingPeriodPattern = /\.(\s|\*|†)*$/;
  return nullToEmptyString(itemProperty).replace(trailingPeriodPattern, '');
}
/**
 * Prefixes a price with `$` when the price is purely numeric.
 *
 * Non-numeric prices (for example `2/$5`) are returned as they are, and a
 * falsy price becomes `''`.
 *
 * The numeric check is explicit rather than relying on the truthiness of
 * `Number(price)`: a truthiness test treats "0" and "0.00" as "not a number"
 * and would leave them without a dollar sign.
 *
 * @param {?(string|number)} itemProperty - Raw `price` value of an ad item.
 * @returns {string} The price as text, dollar-prefixed when numeric.
 */
function addDollarSignIfPriceIsNumber(itemProperty) {
  /* Same falsy-to-'' normalisation that `nullToEmptyString` performs. */
  const priceText = itemProperty ? itemProperty : '';
  /* `Number('')` and `Number('   ')` are 0, so blank text is ruled out first. */
  const isNumeric =
    String(priceText).trim() !== '' && Number.isFinite(Number(priceText));
  return isNumeric ? `$${priceText}` : `${priceText}`;
}
/**
 * Condenses an item's fine print into a short clause for the spoken script.
 *
 * The text is split into fragments at sentence ends (`. `), asterisks and
 * daggers; fragments that are legal boilerplate, very short, or that mention
 * Target Circle or help.target.com are dropped, and the rest are joined
 * with `; `.
 *
 * @param {?string} finePrint - Raw `fine_print` value of an ad item.
 * @returns {string} `'; Fine print: …'`, or `''` when nothing remains, when
 *   the input is missing, or when the result is longer than 1000 characters.
 */
function formatFinePrint(finePrint) {
  /* A missing value would make the `.replace` chain below throw. */
  if (!finePrint) {
    return '';
  }

  const legalBoilerplate = /(©|reserved|trademark|property|countr)/i;

  const newFinePrint = String(finePrint)
    /* Per the original author's note: this is meant to help filter out
       'Expect More. Pay Less.' on the "company messaging" page. */
    .replace(/(More|Less)\./g, '')
    /* The dot is escaped (and the word anchored) so that only the
       abbreviation "No." is expanded, not "Not " or "Now ". */
    .replace(/\bNo\. /g, 'Product number: ')
    .replace(/\/mo\./g, ' per month')
    /* Non-capturing split: the separators themselves must not end up in
       the list of fragments. */
    .split(/\.\s+|\*|†/)
    .filter((fragment) => !legalBoilerplate.test(fragment))
    .filter((fragment) => fragment.length > 5)
    .filter((fragment) => !fragment.includes('Target Circle'))
    .filter((fragment) => !fragment.includes('help.target.com'))
    .join('; ');

  /* Nothing left to read, or too long to be worth reading aloud. */
  if (newFinePrint.length === 0 || newFinePrint.length > 1000) {
    return '';
  }
  return `; Fine print: ${newFinePrint}`;
}
/**
 * Builds the sentence that is read aloud for a single ad item.
 *
 * The draft has the shape `title; price; description[; Fine print: …]. ` and
 * is then passed through an ordered list of textual rewrites. The order of
 * the rules matters: later rules work on the output of earlier ones.
 *
 * @param {Object} item - A hotspot of the flyer; `title`, `price`,
 *   `product_description` and `fine_print` are read from it.
 * @returns {string} The text to be spoken for this item.
 */
function makeItemScript(item) {
  /* [pattern, replacement] pairs, applied strictly from top to bottom. */
  const rewriteRules = [
    [/BOGO/g, 'Buy one get one '],
    [/\s*[†•]\s*/g, '; '],
    /* An empty product_description leaves two semicolons in a row. */
    [/;\s*;/g, ';'],
    /* An empty price leaves a dangling semicolon before the period. */
    [/;\s*\./g, '.'],
    /* Whitespace and punctuation clean-up. */
    [/\s+/g, ' '],
    [/\s+\./g, '.'],
    [/\.\./g, '.'],
    [/\*/g, ''],
    [/\.;/g, ';'],
    /* Disabled in the original: [/;\s*Fine print:\s*\./, '.'] */
    /* Unit abbreviations spelled out. In the replacement strings a `$`
       followed by a space is inserted literally. */
    [/-pc\./g, '-piece'],
    [/-pk\./g, '-pack'],
    [/-qt\./g, '-quart'],
    [/-ct\./g, '-count'],
    [/-oz\./g, '-ounce'],
    [/\/lb/g, '$ per pound'],
    [/\/mo/g, '$ per month'],
    [/-in\./g, '-inch'],
    [/-pt\./g, '-pint'],
    [/-fl\./g, '-fluid'],
    [/-lb\./g, '-pound'],
    /* Multi-buy prices: "2/$5" becomes "2 for $5". */
    [/([0-9]+)\/\$/g, "$1 for $"],
    /* The Azure TTS REST API chokes on the replaced characters below. */
    [/\&/g, ' and '],
    [/™/g, ' '],
  ];

  const titlePart = stripPeriod(item.title);
  const pricePart = addDollarSignIfPriceIsNumber(item.price);
  const descriptionPart = stripPeriod(item.product_description);
  const finePrintPart = formatFinePrint(item.fine_print);
  const draft = `${titlePart}; ${pricePart}; ${descriptionPart}${finePrintPart}. `;

  return rewriteRules.reduce(
    (text, [pattern, replacement]) => text.replace(pattern, replacement),
    draft
  );
}
/**
 * Formats an item count with the matching noun.
 *
 * Only a count of exactly one takes the singular; zero reads "0 items".
 *
 * @param {number} itemNumber - Number of items.
 * @returns {string} For example `'1 item'`, `'0 items'` or `'12 items'`.
 */
function pluralize(itemNumber) {
  const noun = (itemNumber === 1) ? 'item' : 'items';
  return `${itemNumber} ${noun}`;
}
/**
 * Turns a page's comma-separated description into a `; `-separated list of
 * unique entries suitable for reading aloud.
 *
 * The codes `BEV` and `SNC` are spelled out and `NIT` is dropped (its meaning
 * is not documented in this file). The codes are matched as whole words only,
 * so that ordinary words containing them (e.g. "FURNITURE") stay intact.
 * Entries are trimmed before duplicates are removed, and empty entries are
 * discarded.
 *
 * @param {?string} desc - Raw `page_description` of a flyer page.
 * @returns {string} The cleaned-up description; `''` when `desc` is missing.
 */
function massagePageDescription(desc) {
  /* A missing description would make the `.replace` chain below throw. */
  if (!desc) {
    return '';
  }

  const entries = String(desc)
    .replace(/\bBEV\b/g, 'beverages')
    .replace(/\bSNC\b/g, 'novelty candies')
    .replace(/\bNIT\b/g, '')
    .split(',')
    .map((entry) => entry.trim())
    .filter((entry) => entry.length > 0);

  /* A Set keeps the first occurrence of each entry, in insertion order. */
  return [...new Set(entries)].join('; ');
}
/**
 * Converts the weekly-ad JSON into an array of pages with ready-to-read text.
 *
 * `promotion_message` is deliberately left out because, per the original
 * author's note, it is always included in `product_description`.
 *
 * @param {string} jsonString - JSON text with a top-level `pages` array.
 * @returns {Array<Object>} One object per retained page, in flyer order:
 *   `items` (each with `tts_item_script` and `original_properties`),
 *   `tts_page_script`, and the page's own `original_properties`.
 * @throws {SyntaxError} When `jsonString` is not valid JSON.
 */
function parseFlyer(jsonString) {
  return JSON.parse(jsonString).pages
    /* Filter pages that advertise Target services: only pages whose
       `indd_page_number` converts to a non-zero number are kept. */
    .filter((page) => Number(page.indd_page_number))
    .map((page) => {
      /* A page without `hotspots` is treated as a page without items. */
      const newItems = (page.hotspots || [])
        /* When fewer than two of `title`, `price` and `product_description`
           are present, the item is usually just a page blurb. */
        .filter((item) =>
          [!!item.title, !!item.price, !!item.product_description]
            .filter((isPresent) => isPresent).length > 1
        )
        .map((item) => ({
          tts_item_script: makeItemScript(item),
          original_properties: {
            title: item.title,
            price: item.price,
            product_description: item.product_description,
            fine_print: item.fine_print,
            /* promotion_message: item.promotion_message */
          }
        }));

      return {
        items: newItems,
        /* Announce the number of items that are actually read, not the raw
           hotspot count, which also includes the filtered-out blurbs. */
        tts_page_script: `${massagePageDescription(page.page_description)}; ${pluralize(newItems.length)} on this page. `,
        original_properties: {
          page_description: page.page_description,
          indd_page_number: page.indd_page_number
        }
      };
    });
}
// let ff = Object.keys(f).reduce( (accArray, page_no) => { return accArray.concat(f[page_no])}, [] );
/* f.map( (page, i) => `Page ${i+1}: ${page.tts_page_script} ${ page.items.reduce( (acc, item) => { return acc + item.tts_item_script},"") }`. */
/* ). */
/* forEach( e => console.log(e)); */
//Object.keys(f);
//filter( pageArray => pageArray.filter( item => item.product_description.match(/wiffer/) ).length > 0 )
//map( pageArray => pageArray.filter( item => !(!!item.price) ) )
//reduce( (acc, pageArray) => { return acc.concat(pageArray.map( item => { console.log(item.tts_item_script); return item.tts_item_script} ))}, []).filter( str => str.match(/brief/i) )//.forEach( i => console.log(i) )
/* To print the text to be read (it would probably have been easier to `map` over `tts_item_script` and then `join('')`): */
/*
reduce(
(acc, pageArray) => {
return acc + pageArray.reduce( (acc, item) => { return acc + item.tts_item_script }, "")
},
""
)
*/
/* ============================================================================= */
/* Entry point of the Node.js port: downloads the weekly-ad JSON, converts it
   with `parseFlyer`, and prints the spoken script of a single page. */
(function() {
  "use strict";

  /* Static `import` declarations are only valid at the top level of a module,
     so the imports that replaced these `require`s live at the top of the file:
       var sdk = require("microsoft-cognitiveservices-speech-sdk");
       var readline = require("readline");
  */

  /* http.request('http://localhost:4000', (res) => { */
  const url = 'https://api.target.com/weekly_ads/v1/promotions/3306-20221218?key=9ba599525edd204c560a2182ae1cbfaa3eeddca5';

  /* Zero-based index of the one page that is printed while the port is
     still in progress. */
  const pageIndexToPrint = 27;

  https.get(
    url,
    (res) => {
      if (res.statusCode !== 200) {
        console.error(`Request failed with HTTP status ${res.statusCode}.`);
        /* Consume the response so that its resources are released. */
        res.resume();
        process.exitCode = 1;
        return;
      }

      res.setEncoding('utf-8');
      let rawData = '';

      res.on(
        'data',
        (chunk) => {
          rawData += chunk;
          // To persist the download instead, see
          // https://stackoverflow.com/a/11267583/1498178
          // (Do note the caveat at the bottom; so if this is something way
          // larger, probably `createWriteStream` is the way to go:
          // https://stackoverflow.com/a/43370201/1498178 )
          /* fs.appendFile('target_2022-12-18.json', chunk, (err) => { */
          /*   if (err) { throw err; } */
          /*   console.log(`This chunk (length: ${chunk.length}) is written.`); */
          /* }); */
        }
      );

      res.on(
        'end',
        () => {
          let pageScripts;
          try {
            pageScripts = parseFlyer(rawData).map(
              (page, i) => `Page ${i+1}: ${page.tts_page_script} ${ page.items.reduce( (acc, item) => { return acc + item.tts_item_script; }, "") }`
            );
          } catch (err) {
            /* Thrown inside an event callback, this would otherwise be an
               uncaught exception. */
            console.error(`Could not parse the weekly ad: ${err.message}`);
            process.exitCode = 1;
            return;
          }

          const str = pageScripts[pageIndexToPrint];
          if (str === undefined) {
            console.error(`No page at index ${pageIndexToPrint}; the flyer has ${pageScripts.length} readable page(s).`);
            process.exitCode = 1;
            return;
          }
          console.log(str);
          /* console.log(JSON.parse(rawData).pages[0].page_description); */
        }
      );
    }
  ).on(
    'error',
    (err) => {
      /* Without a listener, a failed request emits an unhandled 'error'
         event and terminates the process. */
      console.error(`Request for the weekly ad failed: ${err.message}`);
      process.exitCode = 1;
    }
  );

  /*
  var audioFile = "YourAudioFile.wav";
  // This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
  const speechConfig = sdk.SpeechConfig.fromSubscription(process.env.SPEECH_KEY, process.env.SPEECH_REGION);
  const audioConfig = sdk.AudioConfig.fromAudioFileOutput(audioFile);
  // The language of the voice that speaks.
  speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural";
  // Create the speech synthesizer.
  var synthesizer = new sdk.SpeechSynthesizer(speechConfig, audioConfig);
  var rl = readline.createInterface({
  input: process.stdin,
  output: process.stdout
  });
  rl.question("Enter some text that you want to speak >\n> ", function (text) {
  rl.close();
  // Start the synthesizer and wait for a result.
  synthesizer.speakTextAsync(text,
  function (result) {
  if (result.reason === sdk.ResultReason.SynthesizingAudioCompleted) {
  console.log("synthesis finished.");
  } else {
  console.error("Speech synthesis canceled, " + result.errorDetails +
  "\nDid you set the speech resource key and region values?");
  }
  synthesizer.close();
  synthesizer = null;
  },
  function (err) {
  console.trace("err - " + err);
  synthesizer.close();
  synthesizer = null;
  });
  console.log("Now synthesizing to: " + audioFile);
  });
  */
}());
@toraritte
Copy link
Author

Steps:

  1. Go to https://api.target.com/weekly_ads/v1/promotions/3306-20221218?key=9ba599525edd204c560a2182ae1cbfaa3eeddca5
    (20221218 can be any other date that falls on a Sunday - as of the time of this writing)

  2. Open the browser's dev tools and go to the JavaScript console.

  3. Use the script above.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment