Skip to content

Instantly share code, notes, and snippets.

@parosky
Created December 11, 2013 09:40
Show Gist options
  • Save parosky/7907612 to your computer and use it in GitHub Desktop.
Save parosky/7907612 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import json
import urllib2
import urllib
import os
import glob
import PIL
import leargist
import numpy as np
import sklearn.decomposition
import sklearn.mixture
# Azure Marketplace credentials for the Bing Search API.
# NOTE: these are placeholders — the script raises NameError unless you
# replace them with your real account ID and key before running.
bing_id = YOUR_BING_ID
bing_key = YOUR_BING_KEY
def get_images(query):
if not os.path.exists(query):
os.mkdir(query)
skips = [0, 50]
count = 0
for skip in skips:
url = u'https://api.datamarket.azure.com/Bing/Search/Image?$format=json&Query=%27{{query}}%27&$skip={{skip}}'
url = url.replace('{{query}}', urllib.quote(query.encode('utf-8', 'ignore')))
url = url.replace('{{skip}}', str(skip))
password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
password_mgr.add_password(None, 'http://api.datamarket.azure.com', bing_id, bing_key)
handler = urllib2.HTTPBasicAuthHandler(password_mgr)
opener = urllib2.build_opener(handler)
urllib2.install_opener(opener)
line = urllib2.urlopen(url).read()
open('%s/%d.txt' % (query, skip), 'w').write(line)
r = json.loads(line)
for item in r['d']['results']:
img_url = item['MediaUrl']
print count, img_url
try:
urllib.urlretrieve(img_url.encode('utf-8', 'ignore'), '%s/%d.jpg' % (query, count))
except:
print 'error'
continue
count += 1
def extract_features(directory):
features = []
filenames = glob.glob('%s/*.jpg' % directory)
filenames.sort(lambda a,b: int(a[a.index('/')+1:a.rindex('.')])-int(b[b.index('/')+1:b.rindex('.')]))
for filename in filenames:
num = filename[filename.index('/')+1:filename.rindex('.')]
try:
im = PIL.Image.open(filename)
except:
print 'load error'
features.append(leargist.color_gist(im))
features = np.array(features)
np.save('%s/features' % directory, features)
def kl(g1, g2):
    """Kullback-Leibler divergence KL(N1 || N2) between the first
    (single) Gaussian components of two fitted diagonal-covariance GMMs.

    Uses the closed form for multivariate normals:
        0.5 * ( log|C2|/|C1| + tr(C2^-1 C1)
                + (m2-m1)^T C2^-1 (m2-m1) - d )
    """
    mean_a = g1.means_[0]
    mean_b = g2.means_[0]
    # covars_[0] holds per-dimension variances; lift them to full
    # diagonal matrices so the generic matrix formula applies.
    cov_a = np.diag(g1.covars_[0])
    cov_b = np.diag(g2.covars_[0])
    cov_b_inv = np.linalg.inv(cov_b)
    diff = mean_b - mean_a
    total = np.log(np.linalg.det(cov_b) / np.linalg.det(cov_a))
    total += np.trace(np.dot(cov_b_inv, cov_a))
    total += np.dot(np.dot(diff.T, cov_b_inv), diff)
    total += -len(mean_a)
    total *= 1. / 2.
    return total
def run():
towns = [u'高円寺', u'下北沢', u'霞ヶ関', u'池袋', u'渋谷', u'浅草', u'新宿', u'神楽坂', u'代官山', u'表参道', u'原宿', u'バラナシ', u'デリー', u'ムンバイ', u'川越', u'京都']
# get images
for town in towns:
get_images(town)
# extract features
for town in towns:
extract_features(town)
# load extracted features
d = []
for town in towns:
d.append(np.load(u'%s/features.npy' % town))
# PCA
pca = sklearn.decomposition.PCA(n_components=20)
pca.fit(np.vstack(d))
# GMM fitting
g = []
for i,town in enumerate(towns):
g.append(sklearn.mixture.GMM(n_components=1, covariance_type='diag'))
g[i].fit(pca.transform(d[i]))
similarities = np.zeros((len(towns),len(towns)))
# compute KL divergence
for i in range(len(towns)):
for j in range(len(towns)):
similarities[i,j] = (kl(g[j],g[i])+kl(g[i],g[j])/2)
# show results
r = []
for i in range(1,len(towns)):
r.append((similarities[0, i], towns[i]))
r.sort()
for rr in r:
print rr[1], rr[0]
if __name__ == '__main__':
run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment