Instantly share code, notes, and snippets.

Embed
What would you like to do?
<?php
/*
Export blog URLs from google spreadsheet CSV into a single column csv file.
Duplicate URLs are removed.
Created: December 06, 2018
Modifed: December 07, 2018
*/
// Libraries loaded via composer
require '../vendor/autoload.php';
// use "composer require league/csv" to install CSV (https://csv.thephpleague.com/)
use League\Csv\Reader;
use League\Csv\Writer;
// Path to CSV file downloaded from https://docs.google.com/spreadsheets/d/1_PufzMGKqW25QjLYX2A3EiABqxPl7yRsm56YhkHAO1s/edit#gid=1749282211
$input_spreadsheet = 'D:\Archiving stuff\Archive Team\Tumblr NSFW project (Dec 2018)\Form responses\responses-2018-12-07-140119.csv';
// Path to save unique URL list
$output_spreadsheet = 'form-urls-' . date('Y-m-d-His') . '.csv';
// .tumblr.com Regex
$url_regex = '/(?:https?:\/\/)?(?:www.)?((?:[\w-]+)\.tumblr\.com)\/?/';
// Custom domain regex
$custom_url_regex = '/^(?:https?:\/\/)?(?:www\.)?([a-zA-Z0-9-.]{2,256}\.[a-z]{2,20})\/?$/';
// Unique *.tumblr.com URLs.
$urls = array();
// Unique custom domain URLs.
$custom_urls = array();
// Text and URLs that failed the regex.
$other = array();
$row_count = 0;
$total_urls = 0;
$unique_urls = 0;
$duplicate_urls = 0;
$other_count = 0;
try {
// Load CSV file
$reader = Reader::createFromPath($input_spreadsheet, 'r');
// Set header offset as first row
$reader->setHeaderOffset(0);
// Get Records and set header values
$records = $reader->getRecords(['date', 'url_1', 'url_2', 'url_3', 'url_4', 'url_5', 'url_6', 'url_7', 'url_8', 'url_9', 'url_10', 'note']);
// Loop through each item from the csv file
foreach ($records as $index => $row) {
output('Row: ' . $index);
output('Date: ' . $row['date']);
output('URLs: ');
foreach ($row as $value_index => $value) {
if ($value_index === 'date' || $value_index === 'note') { // Skip date & note values
continue;
}
$value = trim($value);
if (empty($value)) { // Skip empty
continue;
}
if (preg_match($url_regex, $value, $match)) { // .tumblr.com
$total_urls++;
$url = "http://" . strtolower($match[1]);
if (in_array($url, $urls) === false) { // Unique url
$unique_urls++;
output(" " . $url);
$urls[] = $url;
} else { // Duplicate url
$duplicate_urls++;
output(" " . $url . " (Duplicate)");
}
} else if (preg_match($custom_url_regex, $value, $match)) { // Custom domain
$total_urls++;
$url = "http://" . strtolower($match[1]);
if (in_array($url, $urls) === false) { // Unique url
$unique_urls++;
output(" " . $url);
$custom_urls[] = $url;
} else { // Duplicate url
$duplicate_urls++;
output(" " . $url . " (Duplicate)");
}
} else { // Failed URL regex
$other[] = $value;
$other_count++;
}
}
output();
$row_count++;
}
output('Custom Domains:');
foreach ($custom_urls as $value) {
output(" " . $value);
}
output();
output('Other (failed URL regex):');
foreach ($other as $value) {
output(" " . $value);
}
output();
output();
output('Total Rows: ' . number_format($row_count));
output('Total URLs: ' . number_format($total_urls));
output('Unique URLs: ' . number_format($unique_urls));
output('Duplicate URLs: ' . number_format($duplicate_urls));
output('Other: ' . number_format($other_count));
try {
$writer = Writer::createFromPath($output_spreadsheet, 'w+');
// Insert header
$writer->insertOne(['url']);
// Insert .tumblr.com rows
foreach ($urls as $url) {
$writer->insertOne([$url]);
}
// Insert custom domain rows
foreach ($custom_urls as $url) {
$writer->insertOne([$url]);
}
output();
output("Created file: " . $output_spreadsheet);
} catch (CannotInsertRecord $e) {
output("Failed insert record: " . $e->getRecords());
output("File not created");
}
} catch (Exception $e) {
echo 'Caught exception: ', $e->getMessage(), "\n";
}
function output (string $text = '') {
echo $text . " \n";
}
@VerifiedJoseph

This comment has been minimized.

Owner

VerifiedJoseph commented Dec 6, 2018

The csv file this script creates can be viewed here: https://verifiedjoseph.com/archiveteam/tumblr-form-urls/

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment