Skip to content

Instantly share code, notes, and snippets.

@swsalim
Last active April 29, 2024 02:55
Show Gist options
  • Save swsalim/664937980b4333300def43c99207cc3e to your computer and use it in GitHub Desktop.
Clean APIFY Data
import * as fs from 'fs';
import * as path from 'path';
/** Object shape with one string entry per weekday, keyed by the lowercase English day name. */
type Days = Record<
  'monday' | 'tuesday' | 'wednesday' | 'thursday' | 'friday' | 'saturday' | 'sunday',
  string
>;

/** Two-letter abbreviation for each weekday, listed Monday through Sunday. */
const days: Days = {
  monday: 'Mo',
  tuesday: 'Tu',
  wednesday: 'We',
  thursday: 'Th',
  friday: 'Fr',
  saturday: 'Sa',
  sunday: 'Su',
};
/**
 * Converts arbitrary text into a lowercase, hyphen-separated slug.
 *
 * Steps, in order:
 *  1. Compose the text to NFC and lowercase it, so accented letters match the
 *     precomposed, lowercase entries of the transliteration table.
 *  2. Replace every character found in `from` with the character at the same
 *     index in `to` (accents -> plain letters, separators -> '-').
 *  3. Trim surrounding whitespace, turn whitespace runs and slashes into '-',
 *     and '&' into '-and-'.
 *  4. Drop everything that is not a word character or '-', then collapse
 *     runs of '-' into a single '-'.
 *
 * @param str - The text to convert.
 * @returns The slug; may be empty if `str` holds no usable characters.
 */
const slugify = (str: string): string => {
  // Transliteration table: from[i] is replaced by to[i].
  const from = 'àáãäâèéëêìíïîòóöôùúüûñç·/_,:;';
  const to = 'aaaaaeeeeiiiioooouuuunc------';

  // Look each character up in the table by the character itself. The lookup
  // index must come from `from`, not from the character's position in `str`.
  const transliterated = Array.from(str.normalize('NFC').toLowerCase(), (letter) => {
    const tableIndex = from.indexOf(letter);
    return tableIndex === -1 ? letter : to.charAt(tableIndex);
  }).join('');

  return transliterated
    .trim() // Remove whitespace from both sides
    .replace(/\s+/g, '-') // Replace whitespace runs with -
    .replace(/\/+/g, '-') // Replace / with -
    .replace(/&/g, '-and-') // Replace & with 'and'
    .replace(/[^\w-]+/g, '') // Remove all non-word chars
    .replace(/--+/g, '-'); // Replace multiple - with single -
};
/**
 * Cleans a list of scraped place records.
 *
 * For every entry that has at least one opening-hours item, a shallow copy is
 * produced in which:
 *  - `website` is blanked when it contains the text 'healthhub';
 *  - `latLng` is set to "<lat>, <lng>" taken from `location` (or '' when the
 *    entry has no `location`);
 *  - the fields listed in `keysToDelete` are removed;
 *  - `slug` is set to the slugified `title` ('' when `title` is missing).
 *
 * Entries without opening hours (missing, null or empty) and entries that are
 * not objects are skipped. The input records themselves are not modified.
 *
 * @param jsonData - Parsed JSON; expected to be an array of place objects.
 * @returns A new array with the cleaned places, or an empty array when
 *   `jsonData` is not an array.
 */
function processData(jsonData: any): any {
  // Fields removed from every place. Built once, not once per place.
  const keysToDelete = [
    'popularTimesHistogram',
    'popularTimesLiveText',
    'popularTimesLivePercent',
    'peopleAlsoSearch',
    'additionalInfo',
    'location',
    'address',
    'subTitle',
    'description',
    'menu',
    'categoryName',
    'neighborhood',
    'locatedIn',
    'plusCode',
    'placeId',
    'categories',
    'cid',
    'imageCategories',
    'searchPageUrl',
    'searchPageLoadedUrl',
    'searchString',
    'scrapedAt',
    'imagesCount',
    'webResults',
    'orderBy',
    'reviewsTags',
    'questionsAndAnswers',
    'updatesFromCustomers',
    'reserveTableUrl',
    'googleFoodUrl',
    'hotelStars',
    'rank',
    'claimThisBusiness',
    'hotelDescription',
    'checkInDate',
    'checkOutDate',
    'similarHotelsNearby',
    'hotelReviewSummary',
    'hotelAds',
    'placesTags',
    'gasPrices',
  ];

  const result: any[] = [];
  // Anything that is not an array contains no places to process.
  if (!Array.isArray(jsonData)) {
    return result;
  }

  for (const rawPlace of jsonData) {
    // Skip malformed entries instead of crashing on property access.
    if (rawPlace == null || typeof rawPlace !== 'object') {
      continue;
    }
    // Places without opening hours are dropped; a missing field counts as none.
    const hours = rawPlace.openingHours;
    if (hours == null || hours.length === 0) {
      continue;
    }

    // Work on a shallow copy so the caller's records stay untouched.
    const place: any = { ...rawPlace };

    if (typeof place.website === 'string' && place.website.includes('healthhub')) {
      place.website = '';
    }

    // Read the coordinates before `location` is removed below.
    const location = place.location;
    place.latLng = location ? `${location.lat}, ${location.lng}` : '';

    for (const key of keysToDelete) {
      delete place[key];
    }

    place.slug = slugify(String(place.title ?? ''));
    result.push(place);
  }
  return result;
}
/**
 * Reads a JSON file, runs its parsed content through `processData`, and
 * writes the result as pretty-printed JSON (2-space indent) into the
 * `./tasks/data/parsedData` directory, creating that directory if needed.
 * The output file is named `processed-<original base name>`, and its full
 * path is logged once written.
 *
 * @param inputFileName - Path of the JSON file to process.
 */
function processFile(inputFileName: string) {
  // Load and clean the input.
  const sourcePath = path.resolve(inputFileName);
  const cleaned = processData(JSON.parse(fs.readFileSync(sourcePath, 'utf-8')));

  // Make sure the output directory is present.
  const parsedDataDir = path.resolve('./tasks/data/parsedData');
  const dirAlreadyThere = fs.existsSync(parsedDataDir);
  if (!dirAlreadyThere) {
    fs.mkdirSync(parsedDataDir, { recursive: true });
  }

  // Output keeps the input's base name, prefixed with 'processed-'.
  const targetPath = path.join(parsedDataDir, `processed-${path.basename(inputFileName)}`);
  const serialized = JSON.stringify(cleaned, null, 2);
  fs.writeFileSync(targetPath, serialized, 'utf-8');

  console.log(`Processed file saved as: ${targetPath}`);
}
// Script entry point: the first CLI argument names a file inside ./tasks/data.
const command = process.argv[2];
if (command === undefined || command.length === 0) {
  // Nothing to process without a file name; report and exit with a failure code.
  console.error('Please provide a file name.');
  process.exit(1);
}
const filename = ['./tasks/data', command].join('/');
processFile(filename);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment