Skip to content

Instantly share code, notes, and snippets.

@plablo09
Last active March 25, 2022 16:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save plablo09/c3b6f5aa57aec6e1226f714a7426c8d0 to your computer and use it in GitHub Desktop.
Save plablo09/c3b6f5aa57aec6e1226f714a7426c8d0 to your computer and use it in GitHub Desktop.
twitter_context database process
--twitts table
create table tweets
(
t_id bigint,
uname text,
n_id bigint,
loc_text text,
u_img text,
lat float,
lon float,
date_str text,
content text,
clas text
);
--create geometry column and populate it
select AddGeometryColumn('tweets','geom',4326,'Point',2);
update tweets set geom = ST_SetSRID(ST_MakePoint(lon,lat),4326);
--Spatial index
CREATE INDEX tweets_idx_gist ON tweets USING gist(geom);
--Unique identifier and index:
CREATE SEQUENCE tweet_ids;
ALTER TABLE tweets ADD id INT UNIQUE;
UPDATE tweets SET id = NEXTVAL('tweet_ids');
--Set date_str as real date
alter table tweets
alter column date_str type timestamp
using date_str::timestamptz
--Copy the central region to a table:
create table extent as
select 1, ST_SetSRID(ST_Extent(geom),4326) as geom FROM denue;
create table tweets_centro as select t.*
from tweets t, extent d
where st_intersects(t.geom,d.geom )
--Index the table.
alter table tweets_centro
add unique (id);
create index tweet_centro_gist on tweets_centro using gist (geom)
--Reproject to planar coordinates.
ALTER TABLE tweets_centro
ALTER COLUMN geom
TYPE Geometry(Point, 32614)
USING ST_Transform(geom, 32614);
--Also reproject denue:
ALTER TABLE denue
ALTER COLUMN geom
TYPE Geometry(Point, 32614)
USING ST_Transform(geom, 32614);
--Drop unnedded columns from denue:
alter table denue
drop column raz_social,
drop column nombre_act,
drop column tipo_vial,
drop column nom_vial,
drop column tipo_v_e_1,
drop column nom_v_e_1,
drop column tipo_v_e_2,
drop column nom_v_e_2,
drop column tipo_v_e_3,
drop column nom_v_e_3,
drop column numero_ext,
drop column letra_ext,
drop column edificio,
drop column edificio_e,
drop column numero_int,
drop column letra_int,
drop column tipo_asent,
drop column nomb_asent,
drop column nom_cencom,
drop column num_local,
drop column telefono,
drop column correoelec,
drop column www;
-- Amount of jobs per economic unit
alter table denue add column avg_jobs int;
update denue set avg_jobs =
case
when per_ocu like '0%' then 2
when per_ocu like '6%' then 5
when per_ocu like '11%' then 15
when per_ocu like '31%' then 25
when per_ocu like '51%' then 50
when per_ocu like '101%' then 175
when per_ocu like '251%' then 251
else 0
end;
--block level id for each unit.
alter table denue add column cve_mza text;
update denue set cve_mza = cve_ent || cve_mun || cve_loc || ageb || manzana;
-- Set activity classes:
alter table denue add column id_act text;
update denue set id_act =
case
when codigo_act like '%31' or codigo_act like '%32' or codigo_act like '%33'then 'IN'
when codigo_act like '461%' or codigo_act like '462%' or codigo_act like '463%' or
codigo_act like '464%' or codigo_act like '465%' or codigo_act like '466%'then 'CO'
when codigo_act like '48111%' or codigo_act like '485210' then 'TR'
when codigo_act like '51%' or codigo_act like '521%' or codigo_act like '523%' or codigo_act like '524%' or codigo_act like '5312%'
or codigo_act like '5313%' or codigo_act like '541%' or codigo_act like '55%' then 'OF'
when codigo_act like '6111%' or codigo_act like '6115%' then 'ES'
when codigo_act like '621%' or codigo_act like '622%' then 'SA'
when codigo_act like '711121%' or codigo_act like '71212%' or codigo_act like '7132%' or codigo_act like '7139%'
or codigo_act like '7211%' or codigo_act like '7224%' or codigo_act like '7225%' then 'OC'
when codigo_act like '931%' or codigo_act like '622%' then 'GO'
else 'NO CLASS'
end
--create an index for id_act column:
create index denue_id_act_idex on denue (id_act)
--Update tweets table with activity count (one of these per activity type)
alter table tweets add column industria int;
update tweets_centro set industria = foo.cuantos
from
(
select t.id, count(d.id) -1 as cuantos
from tweets_centro t, denue d
where st_dwithin(t.geom,d.geom,500) and d.id_act like'IN'
group by t.id) as foo
where tweets_centro.id = foo.id
--For transport is a little different: we'll grab the airports from one table:
update tweets_centro set transporte_count = foo.cuantos
from
(
select t.id, count(d.id) -1 as cuantos
from tweets_centro t, aeropuertos d
where st_dwithin(t.geom,d.geom,500)
group by t.id) as foo
where tweets_centro.id = foo.id
--And Bus stations from denue, then we'll jus add to the count:
update tweets_centro set transporte_count = transporte_count + foo.cuantos
from
(
select t.id, count(d.id) -1 as cuantos
from tweets_centro t,
(select * from denue where tipocencom like 'CENTRAL CAMIONERA') as d
where st_dwithin(t.geom,d.geom,500)
group by t.id) as foo
where tweets_centro.id = foo.id
-- Compute jobs per city block and store it in manzanas table:
create index manzanas_cvgeo_idx on manzanas (cvegeo);
create index denue_cv_mza_idx on denue (cve_mza);
alter table manzanas add column jobs int;
update manzanas m set jobs = sub.jobs
from
(
select cve_mza, sum(avg_jobs) as jobs
from denue
group by cve_mza
) as sub
where m.cvegeo = sub.cve_mza
-- Compute job to housing ratio:
update manzanas set job_housing = jobs::float/viv_0::float
where cvegeo like '29%' and jobs is not null and viv_0 !=0
--create boolean column for clase transporte:
alter table tweets_centro add column trans_bool boolean;
update tweets_centro set trans_bool = false where transporte_count is null
update tweets_centro set trans_bool = true where transporte_count is not null
-- Add interval column and classify accordint to weekday and time:
alter table tweets_centro add column intervalo text;
update tweets_centro set intervalo =
case
when date_str::time >= '05:00:00' and date_str::time < '10:00:00' then to_char(date_str, 'dy') || '.' || '01'
when date_str::time >= '10:00:00' and date_str::time < '14:00:00' then to_char(date_str, 'dy') || '.' || '02'
when date_str::time >= '14:00:00' and date_str::time < '16:00:00' then to_char(date_str, 'dy') || '.' || '03'
when date_str::time >= '16:00:00' and date_str::time < '19:00:00' then to_char(date_str, 'dy') || '.' || '04'
when date_str::time >= '19:00:00' and date_str::time < '22:00:00' then to_char(date_str, 'dy') || '.' || '05'
when date_str::time >= '22:00:00' and date_str::time <= '23:59:59' then to_char(date_str, 'dy') || '.' || '06'
when date_str::time >= '00:00:00' and date_str::time < '05:00:00' then to_char(date_str, 'dy') || '.' || '01'
else 'n/a'
end;
--Store block_id in tweets table for those twits that are inside blocks:
alter table tweets_centro add column cve_mza text;
update tweets_centro set cve_mza = sub.cvegeo
from
(select t.id, m.cvegeoinventario_vivienda
from tweets_centro t, manzanas m
where st_intersects(m.geom,t.geom)) as sub
where sub.id = tweets_centro.id
-- For thos thad didnt lien within city blocks but are near enough:
update tweets_centro set cve_maza = sub.cvegeo
from(
select t.id, r.cvegeo, dist
from (select * from tweets_centro where cve_maza is null) as t
cross join lateral
(
select m.cvegeo, st_distance(m.geom,t.geom) as dist
from manzanas m
order by m.geom <-> t.geom
limit 1
) as r)as sub
where tweets_centro.id = sub.id and sub.dist <= 500.0
--Add job to housign to tuits table.
alter table tweets_centro add column job_housing float;
update tweets_centro set job_housing = sub.job_housing
from
(select job_housing, cvegeo from manzanas) as sub
where tweets_centro.cve_maza = sub.cvegeo
--Export balanced sample as csv
copy
(
select t_id,uname,loc_text,replace(content,'"',' ') as content, clas, id, industria,
comercio, oficinas, escuelas, salud, ocio, gobierno, trans_bool as trans, intervalo, job_housing
from tweets_centro
where job_housing is not null and clas = 'NEU'
order by random()
limit 5000
)
to '/var/pg_shared/muestra_NEU.csv'
WITH (FORMAT CSV, HEADER TRUE, FORCE_QUOTE *, DELIMITER '|')
--Create table for inventario_vivienda
create table inventario_vivienda
(
id int,
ent text,
nom_ent text,
mun text,
nom_mun text,
loc text,
nom_loc text,
ageb text,
mza text,
conjhab int,
recucall int,
banqueta int,
guarnici int,
arboles int,
rampas int,
alumpub int,
senaliza int,
telpub int,
drenajep int,
transcol int,
acesoper int,
acesoaut int,
puessemi int,
puesambu int,
vivtot int,
tvivhab int,
tvivparhab int,
vph_depto int,
pobtot int,
cve_mza text
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment