Last active
August 1, 2022 21:50
-
-
Save nickumia-reisys/e1ae99322e9d3f04b12ec4d83b86cccb to your computer and use it in GitHub Desktop.
Figure out which web pages are duplicates in the data.gov website migration
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Convert an HTML file to Markdown using Turndown.
 *
 * Usage: node <script> <input.html> <output.md>
 *
 * The input file is read as UTF-8, the elements listed in REMOVED_ELEMENTS
 * are dropped from the conversion, and the resulting Markdown is written to
 * the output path as UTF-8.
 */
const TurndownService = require('turndown');
const fs = require('fs');

// Filters handed to turndownService.remove(); matching elements are
// converted to an empty string.
// NOTE(review): 'comments' is not a standard HTML tag name. It is kept
// because the original script passed it; confirm whether the source pages
// really contain a <comments> element or whether something else was meant.
const REMOVED_ELEMENTS = ['form', 'comments', 'script', 'style', 'footer', 'section'];

const [inputPath, outputPath] = process.argv.slice(2);

if (inputPath === undefined || outputPath === undefined) {
  // Without this guard the fs calls throw a bare TypeError on a missing path.
  console.error('Usage: node <script> <input.html> <output.md>');
  process.exitCode = 1;
} else {
  fs.readFile(inputPath, 'utf8', (err, data) => {
    if (err) {
      console.error(err);
      return;
    }

    const turndownService = new TurndownService();
    turndownService.remove(REMOVED_ELEMENTS);
    const markdown = turndownService.turndown(data);

    fs.writeFile(outputPath, markdown, 'utf8', (writeErr) => {
      // Report write failures instead of silently discarding them.
      if (writeErr) {
        console.error(writeErr);
      }
    });
  });
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import sys

# CSV file whose first column holds one page URL/path per row.
PAGES_CSV = 'WordPress migration - Sheet1.csv'


def page_identifier(url):
    """Return the identifier used to group a page URL/path.

    The identifier is the last '/'-separated segment with anything from the
    first '@' onward stripped. When that segment is 'index.html', the parent
    segment plus '.html' is used instead. A bare 'index.html' with no parent
    segment is returned unchanged.
    """
    paths = url.split('/')
    # split('@')[0] is the whole segment when no '@' is present.
    identifier = paths[-1].split('@')[0]
    if identifier == 'index.html' and len(paths) > 1:
        identifier = paths[-2] + '.html'
    return identifier


def group_pages(urls):
    """Map each page identifier to the list of URLs that share it, in input order."""
    grouped = {}
    for url in urls:
        grouped.setdefault(page_identifier(url), []).append(url)
    return grouped


def main():
    """Print the number of distinct identifiers, then every URL for the requested one."""
    if len(sys.argv) < 2:
        sys.exit(f'Usage: {sys.argv[0]} <page-identifier>')
    page_to_check = sys.argv[1]

    with open(PAGES_CSV, newline='') as csvfile:
        # NOTE(review): quotechar='|' is kept from the original; spreadsheet
        # exports normally quote with '"' -- confirm this is intended.
        pages = csv.reader(csvfile, delimiter=',', quotechar='|')
        # Blank lines come back as empty rows; skip them rather than crash.
        all_pages = group_pages(row[0] for row in pages if row)

    print(len(all_pages))

    if page_to_check not in all_pages:
        sys.exit(f'No pages found for identifier {page_to_check!r}')
    for p in all_pages[page_to_check]:
        print(p)


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment