Skip to content

Instantly share code, notes, and snippets.

@TotallyInformation
Last active September 7, 2019 17:02
Show Gist options
  • Save TotallyInformation/bd2cefe5cf8f36233b3b736deb256788 to your computer and use it in GitHub Desktop.
Save TotallyInformation/bd2cefe5cf8f36233b3b736deb256788 to your computer and use it in GitHub Desktop.
Extract data from a complex HTML table

While Node-RED has some nodes for extracting data from HTML, the nodes are rather simplistic.

To do more complex processing, you can use the cheerio npm module directly. This module is used by the existing nodes but not all of the options are exposed in order to keep the node more easily comprehensible.

I've already published this info on my dev website so I won't repeat everything here. You can find the details at:

https://it.knightnet.org.uk/kb/nr-qa/extract-html-table.html

[{"id":"815bf28f.d0691","type":"inject","z":"e7463dd2.db517","name":"get data","topic":"","payload":"","payloadType":"date","repeat":"","crontab":"","once":false,"x":120,"y":100,"wires":[["46f8420f.a312dc"]]},{"id":"46f8420f.a312dc","type":"http request","z":"e7463dd2.db517","name":"","method":"GET","ret":"txt","url":"http://publicinfobanjir.water.gov.my/View/OnlineFloodInfo/PublicWaterLevel.aspx?scode=SEL","tls":"","x":330,"y":100,"wires":[["808d9c98.e305d"]]},{"id":"fe447848.e25938","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":810,"y":60,"wires":[]},{"id":"d0660df5.4f339","type":"function","z":"e7463dd2.db517","name":"","func":"/*jshint sub:true,asi:true,maxerr:1000*/\n\nconst tableSelector = '#ContentPlaceHolder1_grdStation'\n\nconst cheerio = global.get('cheerio')\nconst $ = cheerio.load(msg.payload)\nconst options = {\n rowForHeadings: 0, // extract th cells from this row for column headings (zero-based)\n ignoreHeadingRow: true, // Don't tread the heading row as data\n ignoreRows: [],\n}\nconst jsonReponse = []\nconst columnHeadings = []\n\n$(tableSelector).each(function(i, table) {\n var trs = $(table).find('tr')\n \n // Set up the column heading names\n getColHeadings( $(trs[options.rowForHeadings]) )\n\n // Process rows for data\n $(table).find('tr').each(processRow)\n})\n\nmsg.payload = {\n columnHeadings: columnHeadings,\n rows: jsonReponse,\n}\n\nreturn msg\n\nfunction getColHeadings(headingRow) {\n const alreadySeen = {}\n \n $(headingRow).find('th').each(function(j, cell) {\n let tr = $(cell).text().trim()\n \n if ( alreadySeen[tr] ) {\n let suffix = ++alreadySeen[tr]\n tr = `${tr}_${suffix}`\n } else {\n alreadySeen[tr] = 1\n }\n \n columnHeadings.push(tr)\n })\n}\n\nfunction processRow(i, row) {\n const rowJson = {}\n \n if ( options.ignoreHeadingRow && i === options.rowForHeadings ) return\n // TODO: Process options.ignoreRows\n \n $(row).find('td').each(function(j, cell) {\n rowJson[ columnHeadings[j] ] = $(cell).text().trim()\n })\n \n // Skip blank rows\n if (JSON.stringify(rowJson) !== '{}') jsonReponse.push(rowJson)\n}\n\n//EOF","outputs":"1","noerr":0,"x":650,"y":100,"wires":[["fe447848.e25938","c35dcf95.5cc9","641996d3.fe8118"]]},{"id":"641996d3.fe8118","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"selected","pt":"msg","to":"{}","tot":"json"},{"t":"set","p":"selected.station","pt":"msg","to":"payload.rows[40]['Station ID']","tot":"msg"},{"t":"set","p":"selected.level","pt":"msg","to":"payload.rows[40]['River Level (m)']","tot":"msg"},{"t":"move","p":"selected","pt":"msg","to":"payload","tot":"msg"}],"action":"","property":"","from":"","to":"","reg":false,"x":820,"y":100,"wires":[["f0152260.70e1d"]]},{"id":"f0152260.70e1d","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":990,"y":100,"wires":[]},{"id":"c35dcf95.5cc9","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"payload","pt":"msg","to":"$$.payload.rows.(\t {\t \"station\": $.'Station ID',\t \"name\": $.'Station Name',\t \"level\": $.'River Level (m)',\t \"alert\": $number($.'River Level (m)') > $number($.Alert) ? true : false\t }\t)","tot":"jsonata"}],"action":"","property":"","from":"","to":"","reg":false,"x":820,"y":140,"wires":[["d505285a.26bcb8"]]},{"id":"d505285a.26bcb8","type":"debug","z":"e7463dd2.db517","name":"","active":true,"complete":false,"x":990,"y":140,"wires":[]},{"id":"808d9c98.e305d","type":"function","z":"e7463dd2.db517","name":"cache","func":"/*jshint sub:true,asi:true,maxerr:1000*/\n// Expects input msgs with topic set \n\nconst contextVarName = 'httpReqMsgs' // homeMsgs\n\n// saved context\nvar cachedMsgs = context.get(contextVarName) || {}\n\n// Only send to single client if needed\nvar socketId = null\nif ( msg.hasOwnProperty('_socketId') ) {\n socketId = msg._socketId\n}\n\n// Replay cache if requested\nif ( msg.hasOwnProperty('cacheControl') && msg.cacheControl.toUpperCase() === 'REPLAY' ) {\n for (var topic in cachedMsgs) {\n let newMsg = {\n \"topic\": topic, \n \"payload\": cachedMsgs[topic]\n }\n // Only send to a single client if we can\n if ( socketId !== null ) newMsg._socketId = socketId\n node.send(newMsg)\n }\n return null\n}\n// -- else if --\n// Empty cache if requested\nif ( (msg.hasOwnProperty('cacheControl') && msg.cacheControl === 'RESET') ||\n (msg.payload.hasOwnProperty('cacheControl') && msg.payload.cacheControl === 'RESET') ) {\n cachedMsgs = {}\n context.set(contextVarName, cachedMsgs)\n return null\n}\n// -- else --\n\n// ignore cacheControl and uibuilder control messages\nif ( msg.hasOwnProperty('cacheControl') || msg.hasOwnProperty('uibuilderCtrl') ) return null\n\n// Add a counter for each device name\nif ( msg.topic.endsWith('$name') ) {\n let topic = msg.topic.replace('$name', '$count')\n let count = cachedMsgs[topic] || 0\n count = count + 1\n cachedMsgs[topic] = count\n let newMsg = {\n \"topic\": topic, \n \"payload\": count\n }\n // Only send to a single client if we can\n if ( socketId !== null ) newMsg._socketId = socketId\n node.send(newMsg)\n}\n\n// Keep the last msg.payload by topic\ncachedMsgs[msg.topic] = msg.payload\n\n// save context for next time\ncontext.set(contextVarName, cachedMsgs)\n\nreturn msg;","outputs":1,"noerr":0,"x":510,"y":100,"wires":[["d0660df5.4f339","c0cacccb.9e3c7"]]},{"id":"afc218de.706f88","type":"inject","z":"e7463dd2.db517","name":"replay","topic":"","payload":"","payloadType":"str","repeat":"","crontab":"","once":false,"x":110,"y":160,"wires":[["eddcc57e.277418"]]},{"id":"c0cacccb.9e3c7","type":"debug","z":"e7463dd2.db517","name":"","active":false,"complete":"true","x":650,"y":180,"wires":[]},{"id":"eddcc57e.277418","type":"change","z":"e7463dd2.db517","name":"","rules":[{"t":"set","p":"cacheControl","pt":"msg","to":"REPLAY","tot":"str"}],"action":"","property":"","from":"","to":"","reg":false,"x":300,"y":160,"wires":[["808d9c98.e305d"]]}]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment