Skip to content

Instantly share code, notes, and snippets.

@misner
Created October 4, 2021 09:30
Show Gist options
  • Save misner/a37428b437113741c244e2edf4cb51ef to your computer and use it in GitHub Desktop.
Save misner/a37428b437113741c244e2edf4cb51ef to your computer and use it in GitHub Desktop.
//current code
import {
ErrorHandlingHOF,
log,
stringExtractedIsInValid,
organicExtractData_LogIfValueIsInvalid } from "./utils/index.js";
import {
slugifyStr,
getFirstNWordsInStr,
longestWord } from "./utils/stringManipulations.js";
import {
COUNTRY_CODE_ISO3166_MAPPING } from "./utils/countryNameToCountryIso3166Code.js";
import {
addWebsiteNameTrackingParams,
getLwnRootDomain } from "./utils/urlManipulations.js";
import { WEBSITE_NAME_INCLUDING_DEV_MODE } from "./utils/fnConstants.js";
/* Database
*/
//import DB from "baqend";
// Baqend SDK, loaded through CommonJS `require`; the ES `import` form above is left disabled.
var DB = require("baqend");
// Name of the Baqend class the handler inserts into (accessed as DB[DB_NAME]).
const DB_NAME = 'Job';
// Module-level cache of the connection promise created by connect_to_baqend,
// so later calls on the same module instance reuse it instead of reconnecting.
let connecting;
/**
 * Connects to the Baqend app and logs in when no user is logged in yet.
 *
 * The resulting promise is cached in the module-level `connecting` variable so
 * that later calls on the same module instance reuse the existing connection.
 *
 * @param {string} app - Baqend app name (or host) passed to `DB.connect`.
 * @returns {Promise<void>} Resolves once the connection (and the login attempt)
 *   has completed. Rejects when `DB.connect` itself fails; in that case the
 *   cache is cleared so the next call can retry.
 */
const connect_to_baqend = (app) => {
  if (typeof connecting === 'object') {
    log("on this lambda execution, the baqend database is already connected thanks to shared-memory ...");
    return connecting;
  }

  connecting = DB.connect(app, true)
    .catch((connectError) => {
      // Never keep a rejected promise in the cache: otherwise every later call
      // on this module instance would fail without even trying to reconnect.
      connecting = undefined;
      throw connectError;
    })
    .then(() => {
      console.log("Baqend Connected. Awaiting login..");
      // Only log in when nobody is logged in yet, to avoid Baqend's
      // "user is already logged in" error.
      const loginIfNecessaryPromise = !DB.User.me
        ? DB.User.login("aws-to-baqend", process.env.AWS_TO_BAQEND)
        : Promise.resolve();

      return loginIfNecessaryPromise
        .then(() => {
          console.log('Logged into Baqend. Connection is ready to be used...');
        })
        .catch((loginIfNecessaryError) => {
          // Best effort, as before: a failed login does not reject the
          // connection, but the actual error is now logged (the previous code
          // logged the promise object, which hid the failure reason).
          console.log("loginIfNecessaryPromise failed", loginIfNecessaryError);
        });
    });

  return connecting;
};
/** Values accepted for the `extract_data_type` event field. */
const AUTHORIZED_DATA_EXTRACTION_TYPES = ['scraping', 'api'];

/**
 * Neutral words after which a job position usually continues with a city or a
 * company name. Not very modular: keywords for all languages are listed here.
 */
const CITY_MARKER_REGEX = /basé à|based in/gi;

/** Trims the string and collapses every run of whitespace into one space. */
const collapseWhitespace = (str) => str.trim().replace(/\s+/g, ' ');

/**
 * Standardizes a raw job position into the value stored as `listing_title`.
 *
 * @param {string} position - Raw, already validated job position.
 * @returns {string} The normalized job position.
 */
function normalizeJobPosition(position) {
  let jobPosition = position;

  // Fix for issue #558: a huge string without any white space was breaking
  // feed rendering, so slashes inside an over-long word get surrounding spaces.
  const longestWordInPosition = longestWord(jobPosition);
  if (longestWordInPosition.length > 30) {
    const spacedWord = longestWordInPosition.replace(/\//g, ' / ');
    // A replacer function is used so that `$` sequences in the word are
    // inserted literally instead of being read as replacement patterns.
    jobPosition = jobPosition.replace(longestWordInPosition, () => spacedWord);
  }

  // Drop everything after a "based in"-like marker.
  jobPosition = jobPosition.split(CITY_MARKER_REGEX)[0];

  // No unconditional lower-casing: it would turn acronyms like "IT" into "it".
  const normalized = collapseWhitespace(jobPosition);

  // When the first 3 words are fully uppercase (ex: "COMMERCIAL H/F"), lowercase
  // the whole string to keep the listing feed consistent. Only 3 words are
  // checked because checking the whole string left titles such as
  // "BUSINESS DEVELOPMENT INTERN PARIS (M/F/D) GmbH" unhandled.
  const leadingWords = getFirstNWordsInStr(normalized, 3);
  return leadingWords === leadingWords.toUpperCase()
    ? normalized.toLowerCase()
    : normalized;
}

/**
 * Lambda handler: validates and standardizes one extracted job listing, then
 * inserts it into the Baqend class named by `DB_NAME`.
 *
 * Values for `listing_id` and `listing_url`, as well as the rejection of
 * listings whose destination url already exists, are handled at db level.
 *
 * @param {object} event - Extracted job data (position, link, companyName,
 *   jobLocationCity, base_url, extract_data_type, base_url_country,
 *   description, cleanFinalApplyDestinationUrl, tagsMatched, company domain
 *   and logo candidates, ...).
 * @param {object} context - Lambda context; `awsRequestId` is used for logging.
 * @param {Function} callback - Unused; kept for signature compatibility.
 * @returns {Promise<{statusCode: number, body: string}|undefined>}
 *   `undefined` when a required input is invalid (the reason is logged),
 *   `{statusCode: 200}` after a successful insert, `{statusCode: 500}` when
 *   connecting to or inserting into Baqend failed.
 */
const handler = ErrorHandlingHOF(async function(event, context, callback) {
  const {
    position,
    link,
    pubDate,
    companyName,
    jobLocationCity,
    base_url,
    extract_data_type,
    base_url_country,
    jkNumber,
    description,
    cleanFinalApplyDestinationUrl,
    tagsMatched,
    companyCertainDomain,
    companyClearbitAutocompApproxDomain,
    companyCertainLogoUrl,
    companyClearbitAutocomppeApproxLogoUrl
  } = event;
  log("Got", {
    position,
    link,
    pubDate,
    companyName,
    jobLocationCity,
    base_url,
    extract_data_type,
    base_url_country,
    jkNumber,
    description,
    companyCertainDomain,
    cleanFinalApplyDestinationUrl,
    tagsMatched,
    companyClearbitAutocompApproxDomain,
    companyClearbitAutocomppeApproxLogoUrl
  });

  /* Standardization & creation of the inputs for the database.
     Note: keep the same order as the columns in the database. */

  /************* standardize extract_data_type *************************/
  const jobDataSourceType = extract_data_type;
  // Must be a non-empty string AND one of the authorized values.
  if (
    stringExtractedIsInValid(jobDataSourceType) ||
    !AUTHORIZED_DATA_EXTRACTION_TYPES.includes(jobDataSourceType)
  ) {
    organicExtractData_LogIfValueIsInvalid("extracttion data type");
    return;
  }
  log(`jobDataSourceType is : ${jobDataSourceType}`);

  /************* standardize listing_master_scraping_query *************************/
  const jobMasterScrapingQuery = base_url;
  if (stringExtractedIsInValid(jobMasterScrapingQuery)) {
    organicExtractData_LogIfValueIsInvalid("jobMasterScrapingQuery");
    return;
  }
  log(`jobMasterScrapingQuery is : ${jobMasterScrapingQuery}`);

  /************* standardize listing_data_source_url *************************/
  const jobDataSourceUrl = link;
  if (stringExtractedIsInValid(jobDataSourceUrl)) {
    organicExtractData_LogIfValueIsInvalid("job details url on source website");
    return;
  }
  log(`jobDataSourceUrl is : ${jobDataSourceUrl}`);

  /************* standardize listing_title *************************/
  if (stringExtractedIsInValid(position)) {
    // Previously logged with the label of the source url check (copy-paste).
    organicExtractData_LogIfValueIsInvalid("job position");
    return;
  }
  const normalizedJobPosition = normalizeJobPosition(position);
  log(`normalizedJobPosition is : ${normalizedJobPosition}`);

  /************* set value for listing_location_restricted *************************/
  const jobLocationRestricted = false;

  /************* standardize base_url_country *************************/
  if (stringExtractedIsInValid(base_url_country)) {
    organicExtractData_LogIfValueIsInvalid("base_url_country");
    return;
  }
  const normalizedjobLocationCountry = collapseWhitespace(base_url_country.toLowerCase());
  log(`normalizedjobLocationCountry is : ${normalizedjobLocationCountry}`);

  /************* set value for listing_location_country_iso3166 *************************/
  // Own-property check so that inputs such as "constructor" cannot match
  // members inherited from Object.prototype.
  const jobLocationCountryIso3166 = Object.prototype.hasOwnProperty.call(
    COUNTRY_CODE_ISO3166_MAPPING,
    normalizedjobLocationCountry
  )
    ? COUNTRY_CODE_ISO3166_MAPPING[normalizedjobLocationCountry]
    : undefined;
  if (typeof jobLocationCountryIso3166 === 'undefined') {
    log(
      "there was a problem : we couldn't find the iso 3166 country code for " +
      base_url_country + " inside functions-src/utils/countryNameToCountryIso3166Code.js"
    );
    return;
  }
  log(`jobLocationCountryIso3166 is : ${jobLocationCountryIso3166}`);

  /************* standardize listing_location_city *************************/
  if (stringExtractedIsInValid(jobLocationCity)) {
    organicExtractData_LogIfValueIsInvalid("jobLocationCity");
    return;
  }
  const normalizedjobLocationCity = collapseWhitespace(jobLocationCity.toLowerCase());
  log(`normalizedjobLocationCity is : ${normalizedjobLocationCity}`);

  /************* set the value of listing_location_city_slug *************************/
  const jobLocationCitySlug = slugifyStr(normalizedjobLocationCity);
  log(`jobLocationCitySlug is : ${jobLocationCitySlug}`);

  /************* standardize tagsMatched *************************/
  // A missing / non-array value used to crash with a TypeError on `.filter`;
  // it is now rejected like every other invalid input.
  if (!Array.isArray(tagsMatched)) {
    organicExtractData_LogIfValueIsInvalid("tagsMatched");
    return;
  }
  const normalizedJobsTagArr = tagsMatched
    .filter((tag) => !stringExtractedIsInValid(tag))
    .map((tag) => collapseWhitespace(tag.toLowerCase()));
  if (normalizedJobsTagArr.length < tagsMatched.length) {
    // At least one tag was dropped because it was not a valid string.
    console.log('This is invalid: ', tagsMatched);
  }
  log("normalizedJobsTagArr is:");
  log(normalizedJobsTagArr);

  /************* set the value of listing_tags_slug *************************/
  const jobTagsSlugArr = normalizedJobsTagArr.map((tag) => slugifyStr(tag));
  log("jobTagsSlugArr is:");
  log(jobTagsSlugArr);

  /************* set the value of listing_description *************************/
  // An empty description stays allowed, but a non-string one used to crash
  // with a TypeError on `.replace`.
  if (typeof description !== 'string') {
    organicExtractData_LogIfValueIsInvalid("description");
    return;
  }
  // Remove occurrences of the word "null" (fixes issue #560).
  const jobDescription = description.replace(/ null /g, ' ');
  log(`jobDescription is : ${jobDescription}`);

  /************* standardize cleanFinalApplyDestinationUrl *************************/
  const jobApplyUrl = cleanFinalApplyDestinationUrl;
  if (stringExtractedIsInValid(jobApplyUrl)) {
    organicExtractData_LogIfValueIsInvalid("final destination url");
    return;
  }
  // Note on process.env.URL: theoretically a conditional assignment like in
  // /sitemaps-gen.js should be used, but links get clicked often even while
  // working, and the destination website should see a clean referrer (for
  // example aijobs.tech) rather than a deploy-preview netlify url.
  const targetWebsiteRootUrl = getLwnRootDomain(process.env.URL);
  const normalizedJobApplyUrl = addWebsiteNameTrackingParams(jobApplyUrl, targetWebsiteRootUrl);
  log(`normalizedJobApplyUrl is : ${normalizedJobApplyUrl}`);

  /************* standardize content_entity_name *************************/
  if (stringExtractedIsInValid(companyName)) {
    organicExtractData_LogIfValueIsInvalid("companyName");
    return;
  }
  const normalizedCompanyName = collapseWhitespace(companyName.toLowerCase());
  log(`normalizedCompanyName is : ${normalizedCompanyName}`);

  /************* set the value of content_entity_name_slug *************************/
  const companyNameSlug = slugifyStr(normalizedCompanyName);
  log(`companyNameSlug is : ${companyNameSlug}`);

  /************* set the value of content_entity_initials *************************/
  // Built from the slug rather than the normalized name: no accents in the
  // initials, and apostrophes (ex: l'oreal) do not need special handling.
  const companyInitials = companyNameSlug
    .split("-") // words are separated by "-" in the slug
    .map((word) => word[0]) // first letter of each word
    .join("")
    .slice(0, 2); // keep at most 2 initials
  log(`companyInitials is : ${companyInitials}`);

  /************* set value for content_entity_domain *************************/
  let companyDomain;
  if (companyCertainDomain) {
    companyDomain = companyCertainDomain;
  } else if (!stringExtractedIsInValid(companyClearbitAutocompApproxDomain)) {
    // Best option after companyCertainDomain. No sanitizing needed: the value
    // comes from the Clearbit API, which already cleans wonky cases.
    companyDomain = companyClearbitAutocompApproxDomain;
  } else {
    companyDomain = '';
    log("unfortunately we could not find the company domain for this job post (no biggie)");
  }
  log(`companyDomain is : ${companyDomain}`);

  /************* set value for content_entity_logo_url *************************/
  let companyLogoUrl;
  if (companyCertainLogoUrl) {
    companyLogoUrl = companyCertainLogoUrl;
  } else if (companyClearbitAutocomppeApproxLogoUrl) {
    // Best option after companyCertainLogoUrl (value comes from Clearbit).
    companyLogoUrl = companyClearbitAutocomppeApproxLogoUrl;
  } else {
    companyLogoUrl = '';
    log("unfortunately we could not find the company logo url for this job post (no biggie)");
  }
  log(`companyLogoUrl is : ${companyLogoUrl}`);

  /************* set value for listing_type *************************/
  // listing_duration and all paid_listing_* columns are n/a for organic listings.
  const listingType = "organic";
  log(`listingType is : ${listingType}`);

  /************* value for listing_apply_email *************************/
  // Safeguard: kept empty so that the front-end "report broken link/empty
  // link" feature, which relies on both destination url and email being
  // empty, keeps working in the rare case where the destination url is empty.
  const listingApplyEmail = "";
  log(`listingApplyEmail is : ${listingApplyEmail}`);

  /* Connect to the db and inject the values computed above.
     Note: rejection of listings whose normalizedJobApplyUrl already exists in
     a record is only performed at db level by a Baqend module (efficiency and
     concurrency). */
  try {
    await connect_to_baqend("listings-network");
    await DB[DB_NAME]({
      target_website: WEBSITE_NAME_INCLUDING_DEV_MODE,
      listing_category: 'j',
      listing_master_scraping_query: jobMasterScrapingQuery,
      listing_data_source_type: jobDataSourceType,
      listing_data_source_url: jobDataSourceUrl,
      listing_title: normalizedJobPosition,
      listing_location_restricted: jobLocationRestricted,
      listing_location_country: normalizedjobLocationCountry,
      listing_location_country_iso3166: jobLocationCountryIso3166,
      listing_location_city: normalizedjobLocationCity,
      listing_location_city_slug: jobLocationCitySlug,
      listing_tags: normalizedJobsTagArr,
      listing_tags_slug: jobTagsSlugArr,
      listing_description: jobDescription,
      listing_destination_url: normalizedJobApplyUrl,
      listing_apply_email: listingApplyEmail,
      content_entity_name: normalizedCompanyName,
      content_entity_name_slug: companyNameSlug,
      content_entity_initials: companyInitials,
      content_entity_domain: companyDomain,
      content_entity_logo_url: companyLogoUrl,
      listing_type: listingType
    }).insert();
  } catch (e) {
    // Errors raised inside Baqend handlers (ex: onInsert) are conveyed here.
    // The error is still not re-thrown (so the invocation itself does not
    // fail), but the response no longer claims the record was inserted.
    console.log('Something went wrong connecting to baqend', e);
    return {
      statusCode: 500,
      body: 'Baqend: Insert failed'
    };
  }

  // Logged outside the try block so that a logging problem can never be
  // mistaken for an insert failure.
  log("we injected into the db the data of " + (context ? context.awsRequestId : undefined));
  return {
    statusCode: 200,
    body: 'Baqend: Inserted'
  };
});
export {handler};
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment