Skip to content

Instantly share code, notes, and snippets.

@aquilax
Created November 2, 2011 14:55
Show Gist options
  • Star 4 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save aquilax/1333830 to your computer and use it in GitHub Desktop.
Save aquilax/1333830 to your computer and use it in GitHub Desktop.
Naive Bayes classifier in PostgreSQL
--
-- PostgreSQL database dump
--
-- Dumped from database version 9.1.1
-- Dumped by pg_dump version 9.1.1
-- Started on 2011-11-02 16:53:36 EET
SET statement_timeout = 0;
SET client_encoding = 'UTF8';
SET standard_conforming_strings = on;
SET check_function_bodies = false;
SET client_min_messages = warning;
--
-- TOC entry 8 (class 2615 OID 28036)
-- Name: bayes; Type: SCHEMA; Schema: -; Owner: -
--
CREATE SCHEMA bayes;
SET search_path = bayes, pg_catalog;
--
-- TOC entry 223 (class 1255 OID 28111)
-- Dependencies: 607 8
-- Name: classify(integer, integer[]); Type: FUNCTION; Schema: bayes; Owner: -
--
CREATE FUNCTION classify(iset_id integer, ifeature_ids integer[]) RETURNS integer
LANGUAGE plpgsql COST 1
AS $$
DECLARE
features_count integer := 0;
r record;
p float := 0;
mult float := 0;
max_prob float := -1;
max_prob_cat integer := 0;
index integer := 1;
fcount integer := 0;
BEGIN
features_count = array_upper(ifeature_ids, 1);
FOR r IN
SELECT category_id, count(*) AS cntr
FROM bayes.feature_category
WHERE set_id = iset_id
GROUP BY category_id
LOOP
p = 1;
WHILE index <= features_count
LOOP
IF (r.cntr = 0) THEN
mult = 0.001;
ELSE
SELECT cnt
FROM bayes.feature_category
WHERE feature_id = ifeature_ids[index]
AND set_id = iset_id
AND category_id = r.category_id INTO fcount;
IF (fcount IS NULL) THEN
fcount = 0;
END IF;
mult = fcount / r.cntr;
END IF;
p = p * mult;
index = index + 1;
END LOOP;
--RAISE NOTICE 'cat % like %', r.category_id, p;
IF p > max_prob THEN
max_prob_cat = r.category_id;
max_prob = p;
END IF;
END LOOP;
RETURN max_prob_cat;
END;
$$;
--
-- TOC entry 224 (class 1255 OID 28110)
-- Dependencies: 607 8
-- Name: tokenize(character varying[], boolean); Type: FUNCTION; Schema: bayes; Owner: -
--
CREATE FUNCTION tokenize(ifeatures character varying[], ido_insert boolean) RETURNS integer[]
LANGUAGE plpgsql COST 1
AS $$
DECLARE
w varchar;
fid integer := 0;
index integer := 1;
count integer := 0;
outa int[];
BEGIN
count = array_upper(ifeatures, 1);
WHILE index <= count
LOOP
RAISE NOTICE '%', ifeatures[index];
SELECT id FROM bayes.feature WHERE val = ifeatures[index] INTO fid;
IF (fid IS NOT NULL) THEN
outa = array_append(outa, fid);
ELSE
IF ido_insert THEN
INSERT INTO bayes.feature (val) VALUES (ifeatures[index]);
outa = array_append(outa, lastval()::int);
END IF;
END IF;
index := index + 1;
END LOOP;
RETURN outa;
END;
$$;
--
-- TOC entry 222 (class 1255 OID 28092)
-- Dependencies: 607 8
-- Name: train(integer, integer, integer[]); Type: FUNCTION; Schema: bayes; Owner: -
--
CREATE FUNCTION train(iset_id integer, icategory_id integer, ifeature_ids integer[]) RETURNS void
LANGUAGE plpgsql COST 1
AS $$
DECLARE
w integer;
fid integer := 0;
index integer := 1;
count integer := 0;
BEGIN
count = array_upper(ifeature_ids, 1);
WHILE index <= count
LOOP
SELECT id
FROM bayes.feature_category
WHERE set_id = iset_id
AND category_id = icategory_id
AND feature_id = ifeature_ids[index] INTO fid;
IF (fid IS NOT NULL) THEN
UPDATE bayes.feature_category
SET cnt = cnt + 1
WHERE id = fid;
ELSE
INSERT INTO bayes.feature_category
(set_id, category_id, feature_id)
VALUES
(iset_id, icategory_id, ifeature_ids[index]);
END IF;
index := index + 1;
END LOOP;
END;
$$;
SET default_tablespace = '';
SET default_with_oids = false;
--
-- TOC entry 206 (class 1259 OID 28114)
-- Dependencies: 8
-- Name: feature; Type: TABLE; Schema: bayes; Owner: -; Tablespace:
--
CREATE TABLE feature (
id integer NOT NULL,
val character varying(50)
);
--
-- TOC entry 208 (class 1259 OID 28124)
-- Dependencies: 1975 8
-- Name: feature_category; Type: TABLE; Schema: bayes; Owner: -; Tablespace:
--
CREATE TABLE feature_category (
id integer NOT NULL,
set_id integer,
category_id integer,
feature_id integer,
cnt integer DEFAULT 1
);
--
-- TOC entry 207 (class 1259 OID 28122)
-- Dependencies: 208 8
-- Name: feature_category_id_seq; Type: SEQUENCE; Schema: bayes; Owner: -
--
CREATE SEQUENCE feature_category_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
--
-- TOC entry 1986 (class 0 OID 0)
-- Dependencies: 207
-- Name: feature_category_id_seq; Type: SEQUENCE OWNED BY; Schema: bayes; Owner: -
--
ALTER SEQUENCE feature_category_id_seq OWNED BY feature_category.id;
--
-- TOC entry 205 (class 1259 OID 28112)
-- Dependencies: 206 8
-- Name: feature_id_seq; Type: SEQUENCE; Schema: bayes; Owner: -
--
CREATE SEQUENCE feature_id_seq
START WITH 1
INCREMENT BY 1
NO MINVALUE
NO MAXVALUE
CACHE 1;
--
-- TOC entry 1987 (class 0 OID 0)
-- Dependencies: 205
-- Name: feature_id_seq; Type: SEQUENCE OWNED BY; Schema: bayes; Owner: -
--
ALTER SEQUENCE feature_id_seq OWNED BY feature.id;
--
-- TOC entry 1973 (class 2604 OID 28117)
-- Dependencies: 205 206 206
-- Name: id; Type: DEFAULT; Schema: bayes; Owner: -
--
ALTER TABLE feature ALTER COLUMN id SET DEFAULT nextval('feature_id_seq'::regclass);
--
-- TOC entry 1974 (class 2604 OID 28127)
-- Dependencies: 207 208 208
-- Name: id; Type: DEFAULT; Schema: bayes; Owner: -
--
ALTER TABLE feature_category ALTER COLUMN id SET DEFAULT nextval('feature_category_id_seq'::regclass);
--
-- TOC entry 1981 (class 2606 OID 28130)
-- Dependencies: 208 208
-- Name: feature_category_pkey; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace:
--
ALTER TABLE ONLY feature_category
ADD CONSTRAINT feature_category_pkey PRIMARY KEY (id);
--
-- TOC entry 1983 (class 2606 OID 28132)
-- Dependencies: 208 208 208 208
-- Name: feature_category_set_id_category_id_feature_id_key; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace:
--
ALTER TABLE ONLY feature_category
ADD CONSTRAINT feature_category_set_id_category_id_feature_id_key UNIQUE (set_id, category_id, feature_id);
--
-- TOC entry 1977 (class 2606 OID 28119)
-- Dependencies: 206 206
-- Name: feature_pkey; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace:
--
ALTER TABLE ONLY feature
ADD CONSTRAINT feature_pkey PRIMARY KEY (id);
--
-- TOC entry 1979 (class 2606 OID 28121)
-- Dependencies: 206 206
-- Name: feature_val_key; Type: CONSTRAINT; Schema: bayes; Owner: -; Tablespace:
--
ALTER TABLE ONLY feature
ADD CONSTRAINT feature_val_key UNIQUE (val);
-- Completed on 2011-11-02 16:53:37 EET
--
-- PostgreSQL database dump complete
--
@ggHj
Copy link

ggHj commented May 21, 2013

Hi, I tried your code. It seem that two things are missing.
Within classify its outer loop the index counter is not reseted.
Second one is maybe some pg version problem.
fcount has to be casted at my machine :
mult = CAST(fcount AS float)/ r.cntr;

@kangasaaron
Copy link

Could you give a tiny bit of instruction on how to use this?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment