Skip to content

Instantly share code, notes, and snippets.

@fiatjaf
Last active October 7, 2015 23:37
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save fiatjaf/d3beecacbd2ea054c587 to your computer and use it in GitHub Desktop.
Save fiatjaf/d3beecacbd2ea054c587 to your computer and use it in GitHub Desktop.
opinionated URL normalization function written in PL/pgSQL

A function that takes a URL string and returns it with

  • https replaced by http (all sites support http endpoints, only some support https);
  • www. removed (all sites should support naked domains);
  • ending slash / removed from path;
  • lowercased domain (but not path);
  • querystring parameters removed, except those listed in the allowed_params table, which expects records like the following:
              hostpath               |  param   
-------------------------------------+----------
 news.ycombinator.com/user           | id
 news.ycombinator.com/threads        | id
 news.ycombinator.com/submitted      | id
 youtube.com/watch                   | v
 youtube.com/playlist                | list
 books.google.com                    | id
 books.google.com.br                 | id
 drive.google.com/folderview         | id
 cuapress.cua.edu/books/viewbook.cfm | book
 www.ucpress.edu/book.php            | isbn
 news.ycombinator.com/saved          | id
 news.ycombinator.com/saved          | comments

Examples:

::DATABASE=> select normalize('https://www.baNANas.com/uVA/?utm=32');
       normalize        
------------------------
 http://bananas.com/uVA
(1 row)

::DATABASE=> select normalize('https://news.ycombinator.com/user?id=fiatjaf');
                  normalize                  
---------------------------------------------
 http://news.ycombinator.com/user?id=fiatjaf
(1 row)
CREATE OR REPLACE FUNCTION normalize(url text) RETURNS text AS $$
DECLARE hp text; pathstart int; search text;
qs json; qspair text; qskeys text[]; qsvals text[]; r RECORD;
BEGIN
hp := split_part(url, '?', 1);
search := split_part(url, '?', 2);
hp = CASE WHEN substring(hp from 0 for 9) = 'https://' -- https to http
THEN substring(hp from 9) ELSE substring(hp from 8)
END;
hp = CASE WHEN substring(hp from 0 for 5) = 'www.' -- remove www.
THEN substring(hp from 5) ELSE hp
END;
pathstart := position('/' in hp); -- lowercase domain, but not path
hp = lower(substring(hp from 0 for pathstart)) || substring(hp from pathstart);
-- normalize query string
IF search IS NOT NULL THEN
qskeys := ARRAY[]::text[];
qsvals := ARRAY[]::text[];
FOREACH qspair IN ARRAY regexp_split_to_array(search, E'\&') LOOP
qskeys = array_append(qskeys, split_part(qspair, '=', 1));
qsvals = array_append(qsvals, split_part(qspair, '=', 2));
END LOOP;
qs := json_object(qskeys, qsvals);
search = '';
FOR r IN SELECT DISTINCT key, value FROM json_each_text(qs) WHERE key IN (SELECT param FROM allowed_params WHERE hostpath = hp) ORDER BY key LOOP
search = search || '&' || r.key || '=' || r.value;
END LOOP;
search = ltrim(search, '&');
return 'http://' || rtrim(hp, '/') || CASE WHEN search <> '' THEN '?' ELSE '' END || search;
END IF;
RETURN 'http://' || rtrim(hp, '/');
END;
$$ LANGUAGE plpgsql;
select normalize('https://www.baNANas.com/uVA/?utm=32');
select normalize('https://news.ycombinator.com/user?id=fiatjaf');
CREATE TABLE allowed_params (
hostpath text,
param text
);
CREATE INDEX ON allowed_params(hostpath);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment