Skip to content

Instantly share code, notes, and snippets.

@MrOrz
Created May 7, 2021 16:07
Show Gist options
  • Save MrOrz/31c881733fc9b9e9aa491940f8b02691 to your computer and use it in GitHub Desktop.
Save MrOrz/31c881733fc9b9e9aa491940f8b02691 to your computer and use it in GitHub Desktop.
165 news crawler
/**
* @OnlyCurrentDoc
*/
// Name of the spreadsheet tab that this script reads the last stored date from
// and appends crawled rows to.
const DATA_SHEET = '165 民眾通報假投資/博弈詐騙網站';
/**
 * Reads the date stored in the last populated row of the data sheet.
 *
 * @returns {Date} Date built from the value in column 6 of the last row, or
 *   the Unix epoch (`new Date(0)`) when the sheet has no rows or that cell
 *   holds a falsy value.
 */
function getLastDate() {
  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(DATA_SHEET);
  const lastRow = sheet.getLastRow();

  // An empty sheet reports 0 rows and getRange() rejects row 0, so fall back
  // to the epoch to make the very first run pick up every article.
  if (lastRow < 1) return new Date(0);

  // NOTE(review): appendTableData writes columns 1-5 with publishDate in
  // column 5, yet this reads column 6 - presumably the sheet derives column 6
  // itself (e.g. via a formula); confirm against the actual spreadsheet.
  return new Date(sheet.getRange(lastRow, 6).getValue() || 0);
}
/**
 * Downloads the 165 news list and selects the reports newer than `fromDate`.
 *
 * Only articles whose title contains '民眾通報假投資' and whose `publishDate`
 * is later than `fromDate` are kept; the result is ordered by `publishDate`,
 * oldest first.
 *
 * @param {Date} fromDate Exclusive lower bound for `publishDate`.
 * @returns {Object[]} Matching article entries as returned by the list API
 *   (assumes the response body is a JSON array - confirm against the API).
 */
function fetchArticles(fromDate) {
  const response = UrlFetchApp.fetch('https://165.npa.gov.tw/api/article/list/news', {method: 'get'});
  const allArticles = JSON.parse(response.getContentText());

  const isNewReport = (article) =>
    article.title.includes('民眾通報假投資') && new Date(article.publishDate) > fromDate;
  const byPublishDateAscending = (earlier, later) =>
    new Date(earlier.publishDate) - new Date(later.publishDate);

  const newReports = allArticles.filter(isNewReport);
  return newReports.sort(byPublishDateAscending);
}
/**
 * Downloads the detail record of a single news article.
 *
 * @param {number|string} articleId Identifier interpolated into the detail URL.
 * @returns {*} The parsed JSON body of the detail response.
 */
function fetchArticle(articleId) {
  const detailUrl = `https://165.npa.gov.tw/api/article/detail/news/${articleId}`;
  const response = UrlFetchApp.fetch(detailUrl, {method: 'get'});
  const payload = response.getContentText();
  return JSON.parse(payload);
}
/**
 * Extracts table rows from an HTML fragment.
 *
 * The fragment is wrapped in a `<body>` element and parsed as XML. Every
 * `<tr>` found anywhere in the tree yields one record built from the text of
 * its first three direct `<td>` children. The first `<tr>` is treated as the
 * header and dropped.
 *
 * @param {string} htmlString HTML fragment; apart from entities and bare
 *   ampersands it must already be well-formed XML.
 * @returns {{name: string, url: string, count: number}[]} One record per
 *   non-header row. Fields are `undefined` (and `count` is `NaN`) when a row
 *   has fewer than three `<td>` cells.
 */
function getTableDataFromHTML(htmlString) {
  // Entity references such as &nbsp; break XML parsing, so blank them out.
  // The pattern only matches real references (&name; / &#123; / &#x1F;): a
  // looser "& up to the next ;" match would swallow text and tags that follow
  // a bare ampersand. Ampersands left over afterwards are bare and get
  // escaped so they survive parsing as literal text.
  const sanitizedHtml = htmlString
    .replace(/&#?\w+;/g, ' ')
    .replace(/&/g, '&amp;');
  const document = XmlService.parse(`<body>${sanitizedHtml}</body>`);

  const elementType = XmlService.ContentTypes.ELEMENT;
  const isCell = (content) =>
    content.getType() === elementType && content.asElement().getName() === 'td';

  // Push into one array rather than re-spreading an accumulator per row.
  const rows = [];
  for (const descendant of document.getDescendants()) {
    if (descendant.getType() !== elementType) continue;
    const element = descendant.asElement();
    if (element.getName() !== 'tr') continue;

    const [name, url, count] = element.getAllContent()
      .filter(isCell)
      .map((cell) => cell.getValue());
    rows.push({ name, url, count: +count });
  }

  return rows.slice(1); // Skip header
}
/**
 * Appends one spreadsheet row per table record below the last populated row
 * of the data sheet.
 *
 * Each row is written to columns 1-5 as
 * `[article.id, name, url, count, article.publishDate]`.
 *
 * @param {{id: *, publishDate: *}} article Article the records came from.
 * @param {{name: *, url: *, count: *}[]} tableData Records to append; an
 *   empty array leaves the sheet untouched.
 * @returns {void}
 */
function appendTableData(article, tableData) {
  // An article without data rows would otherwise request a zero-row range,
  // which the spreadsheet API rejects and which would abort the whole run.
  if (tableData.length === 0) return;

  const sheet = SpreadsheetApp.getActiveSpreadsheet().getSheetByName(DATA_SHEET);
  const lastRow = sheet.getLastRow();
  const rows = tableData.map(({name, url, count}) => [article.id, name, url, count, article.publishDate]);

  sheet.insertRowsAfter(lastRow, rows.length);
  sheet.getRange(lastRow + 1, 1, rows.length, 5).setValues(rows);
}
/**
 * Entry point: crawls every article published after the last stored date and
 * appends its table rows to the data sheet.
 *
 * Articles are handled in the order `fetchArticles` returns them; for each
 * one the detail record is fetched, its `content` HTML is parsed into table
 * records, and those records are appended together with the list entry.
 *
 * @returns {void}
 */
function main() {
  const newArticles = fetchArticles(getLastDate());
  const summaries = newArticles.map(({id, title}) => `#${id}: ${title}`);
  console.log(`Fetched ${newArticles.length} new article(s):`, summaries);

  for (const listed of newArticles) {
    console.log(`Processing article #${listed.id}`);
    const { content } = fetchArticle(listed.id);
    // The list entry (not the detail record) supplies id and publishDate.
    appendTableData(listed, getTableDataFromHTML(content));
  }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment