# Gist by @aflaxman, created Feb 15, 2010 -- species occurrence data,
# covariates, and a pymc model (originally three files concatenated here:
# data.py, a driver script, and model.py).
""" Class for holding species occurrence data, and associated covariates"""
import csv
class Data:
    """Hold species occurrence data and associated covariates.

    Reads a CSV file with columns 'lon', 'lat', 'seen' (0/1), and any
    number of covariate columns whose names start with 'cov'.
    """

    def __init__(self, fname='data.csv'):
        """Load rows from *fname* and partition sites into seen/unseen.

        Builds:
          self.seen   -- list of (lon, lat) tuples where the species was seen
          self.unseen -- list of (lon, lat) tuples where it was not
          self.cov    -- dict mapping (lon, lat) -> list of covariate values
        """
        # Context manager closes the handle promptly; the original
        # `open(fname)` inside DictReader leaked it.
        with open(fname) as f:
            self.raw_data = list(csv.DictReader(f))
        self.seen = []
        self.unseen = []
        self.cov = {}
        for d in self.raw_data:
            z = (float(d['lon']), float(d['lat']))
            if int(d['seen']) == 1:
                self.seen.append(z)
            else:
                self.unseen.append(z)
            self.cov[z] = [float(d[k]) for k in d if k.startswith('cov')]

    def covariates(self, z):
        """Return the covariate vector for site z = (lon, lat)."""
        return self.cov[z]

    def cov_dim(self):
        """Return the number of covariates per site."""
        # next(iter(...)) works on Python 3, where dict.values() is a
        # view and cannot be indexed (the original used .values()[0]).
        return len(next(iter(self.cov.values())))

    def plot(self):
        """Scatter-plot unseen sites (white/cyan) and seen sites (black)."""
        from pylab import plot
        # Hoisted: original recomputed len(seen+unseen) for each branch.
        small = len(self.seen) + len(self.unseen) < 200
        x = [z[0] for z in self.unseen]
        y = [z[1] for z in self.unseen]
        if small:
            plot(x, y, 'o', mfc='white', mew=1, ms=7, color='black')
        else:
            plot(x, y, ',', color='cyan')
        x = [z[0] for z in self.seen]
        y = [z[1] for z in self.seen]
        if small:
            plot(x, y, 'o', mfc='black', mew=1, ms=7, color='black')
        else:
            plot(x, y, ',', color='black')
def generate_synthetic_csv(fname='data.csv', K=20, n=10000, phi=.9):
    """Write a synthetic species-occurrence dataset to *fname*.

    Each of the *n* rows is a site with standard-normal 'lon'/'lat'
    coordinates, a 0/1 'seen' indicator, and K covariates 'cov0'..'cov{K-1}'.
    Sites in the region lat > .5 and lon < .5 are seen with probability
    *phi*; all other sites are never seen.

    K, n and phi were hard-coded constants in the original; they are now
    keyword parameters with the same defaults, so existing callers see
    identical behavior.  K must be at least 10 because cov0..cov9 are
    overwritten with informative features.
    """
    # numpy.random replaces the pymc2 rnormal/rbinomial helpers:
    # pymc's rnormal(mu, tau) with tau=1 is normal(mu, 1).
    from numpy.random import normal, binomial
    if K < 10:
        raise ValueError('K must be >= 10 (cov0..cov9 are structured)')
    columns = ['lon', 'lat', 'seen'] + ['cov%d' % k for k in range(K)]
    # newline='' is the documented way to write csv files on Python 3;
    # `with` guarantees the handle is closed.
    with open(fname, 'w', newline='') as f:
        fc = csv.DictWriter(f, columns)
        fc.writeheader()
        d = {}
        for i in range(n):
            lon = normal(0, 1)
            lat = normal(0, 1)
            d['lon'] = lon
            d['lat'] = lat
            # species is observable only inside the 'true' region
            if lat > .5 and lon < .5:
                d['seen'] = binomial(1, phi)
            else:
                d['seen'] = 0
            # random covariates ...
            for k in range(K):
                d['cov%d' % k] = normal(0, 1)
            # ... then overwrite the first ten with informative features
            d['cov0'] = 1                      # intercept
            d['cov1'] = 1 if lon > 0 else 0
            d['cov2'] = 1 if lat > 0 else 0
            d['cov3'] = lat
            d['cov4'] = lon
            # NOTE(review): the original set cov5 to 0 in BOTH branches of
            # `if norm([lat, lon]) < 1.` -- behavior kept (always 0), but
            # one branch was probably meant to be 1; confirm with author.
            d['cov5'] = 0
            d['cov6'] = 1 if lat < .5 else 0
            d['cov7'] = 1 if lat > -.5 else 0
            d['cov8'] = 1 if lon < .5 else 0
            d['cov9'] = 1 if lon > -.5 else 0
            fc.writerow(d)
# Demo driver: regenerate the synthetic dataset, then fit and plot the model.
import data
import model

if __name__ == '__main__':
    # Guarded so importing this script does not kick off data generation
    # and a long MCMC run as a side effect.
    data.generate_synthetic_csv()
    m = model.Model()
    m.fit()
    m.plot()
""" Class to hold the model variables and do some fitting"""
from numpy import *
from pymc import *
import data
class Model:
    """Holds the PyMC2 model variables and does some fitting.

    The coefficient vector c defines a linear score c * e(x) over the
    covariates e(x) of each site.  Sites that saw the species push the
    score positive via a penalty potential; sites that did not contribute
    a smoothed pseudo-likelihood weighted by log(1 - phi).
    """

    def __init__(self, fname='data.csv'):
        # Observed sites and their covariates.
        self.data = data.Data(fname)
        # approximation parameters (a for penalty, b for sharpness, to be driven to inf)
        self.a = Uninformative('a', value=100.)
        self.b = Uninformative('b', value=10.)
        # parameters to be estimates - coefficients/effects
        self.c = Normal('c', mu=zeros(self.data.cov_dim()), tau=1.)
        self.phi = Beta('phi', 200, 200)
        # constraints for sites that saw species:
        # c*e(x) >= 0 (implemented as exponential penalty)
        # NOTE(review): `np` is never bound in this file -- it uses
        # `from numpy import *`, not `import numpy as np`; confirm np.array
        # resolves at runtime (e.g. via pymc re-exporting np).
        self.seen_e_x = np.array([self.data.covariates(z) for z in self.data.seen])
        # mu_seen in (0, 1): sigmoid of the sharpened score at seen sites.
        self.mu_seen = Lambda('mu_seen', lambda c=self.c, e_x=self.seen_e_x,
            sharpness=self.b: invlogit(sharpness * dot(e_x, c)))

        @potential
        def seen_in_region(mu=self.mu_seen, penalty=self.a):
            # Rewards mu near 1 at seen sites; driving penalty to infinity
            # recovers the hard constraint.
            return penalty * sum(mu)
        self.seen_in_region = seen_in_region
        # likelihood for site that did not see species:
        # implemented as a continuous pdf
        self.unseen_e_x = np.array([self.data.covariates(z) for z in self.data.unseen])
        self.mu_unseen = Lambda('mu_unseen', lambda c=self.c, e_x=self.unseen_e_x,
            sharpness=self.b: invlogit(sharpness * dot(e_x, c)))

        @potential
        def unseen_in_region(mu=self.mu_unseen, phi=self.phi):
            # Each unseen site inside the region (mu ~ 1) contributes
            # log(1 - phi), the probability of a miss.
            return sum(mu) * log(1-phi)
        self.unseen_in_region = unseen_in_region
        self.map = MAP([self.c,
            self.mu_seen, self.mu_unseen,
            self.seen_in_region, self.unseen_in_region])
        self.mc = MCMC(self)
        # a and b are approximation knobs, not parameters to be sampled.
        self.mc.use_step_method(NoStepper, self.a)
        self.mc.use_step_method(NoStepper, self.b)
        #self.mc.use_step_method(NoStepper, self.phi)
        #self.mc.use_step_method(Metropolis, self.c,
        #    proposal_sd=.01, proposal_distribution='Normal')
        #self.mc.use_step_method(HitAndRun, self.c,
        #    proposal_sd=.01, verbose=1)

    def fit(self, a=20., b=20.):
        """ by increasing self.a.value and self.b.value, the
        approximation comes closer to the intended objective"""
        self.a.value = a
        self.b.value = b
        self.phi.value = .9
        # sample(iter, burn, thin): 35000 iterations, 10000 burned, thin 1000.
        self.mc.sample(35000, 10000, 1000, verbose=1)

    def plot(self):
        """Draw coefficient diagnostics (col 1), phi diagnostics (col 2),
        and the data overlaid with posterior region contours (col 3)."""
        from pylab import axis, imshow, contour, contourf, \
            hist, plot, figure, clf, subplot, acorr, \
            xticks, yticks, xlabel, ylabel, title, cm, \
            axes, bar
        from scipy import ndimage
        cols = 3
        clf()
        d = min(self.data.cov_dim(), 10) # only show first 10 coeffs
        # column 1: autocorrelation of each coefficient's centered trace
        for ii in range(d):
            subplot(d+2, cols, cols*ii+1)
            acorr((self.c.trace() - mean(self.c.trace(),0))[:,ii])
            xticks([])
            yticks([])
            ylabel('$c_%d$'%ii, rotation=0)
        # posterior means with ~95% error bars for all coefficients
        # NOTE(review): on Python 3, (d+2)/2 is a float; subplot would
        # need integer division (d+2)//2.
        subplot((d+2)/2, cols, cols*((d+2)/2-1)+1)
        bar(range(len(self.c.value)),
            self.c.trace().mean(0), yerr=self.c.trace().std(0)*1.96,
            fc='none')
        xticks([])
        yticks([])
        xlabel('vals', fontsize=8)
        # column 2: phi histogram, trace, and autocorrelation
        subplot(3, cols, 2)
        hist(self.phi.trace())
        title('$\phi$')
        xticks(fontsize=8)
        yticks([])
        subplot(3, cols, cols + 2)
        plot(self.phi.trace())
        xticks([])
        yticks([])
        subplot(3, cols, 2*cols + 2)
        acorr(self.phi.trace() - self.phi.trace().mean(0))
        xticks([])
        yticks([])
        # column 3: scatter of the data with contours of 2*mu - 1 overlaid
        subplot(1, cols, 3)
        self.data.plot()
        xticks([])
        yticks([])
        for mu_s, mu_u in zip(self.mu_seen.trace(), self.mu_unseen.trace()):
            # create grid to approximate mu over
            w = 100.
            h = 100.
            l,r,b,t =axis()
            X = arange(l, r, (r-l)/w)
            Y = arange(b, t, (t-b)/h)
            Z = zeros((len(Y), len(X)))
            # evaluate c * e_x for seen and unseen z, do gaussian
            # smoothing, and show contour map of results
            # NOTE(review): Python-2-only -- zip() returns an iterator on
            # Python 3, so `zip(...) + zip(...)` raises TypeError, and the
            # float grid indices below would need int() under modern numpy.
            mu_list = zip(self.data.seen, 2*mu_s-1) \
                + zip(self.data.unseen, 2*mu_u-1)
            for z_i, mu_i in mu_list:
                Z[h*(z_i[1]-b)/(t-b), w*(z_i[0]-l)/(r-l)] = mu_i
            # smoothing bandwidth shrinks as the number of sites grows
            Z = ndimage.gaussian_filter(Z, sqrt(200.)/sqrt(len(mu_list)))
            #contour(X + .5*(r-l)/w, Y + .5*(t-b)/h, Z, [-2., -.5, 0., .5, 2.],
            #    colors='black', alpha=.05, linewidth=2)
            contourf(X + .5*(r-l)/w, Y + .5*(t-b)/h, Z, [-2., -.5, 0., .5, 2.],
                colors=[(0,1,0,0),(1,1,1,1),(1,0,0),(1,.5,0),(1,.5,.5)], alpha=.05)
# End of gist.