Skip to content

Instantly share code, notes, and snippets.

@nvbn
Created November 21, 2018 00:15
Show Gist options
  • Save nvbn/1a8fb134917e906527cc1a178e004c70 to your computer and use it in GitHub Desktop.
Save nvbn/1a8fb134917e906527cc1a178e004c70 to your computer and use it in GitHub Desktop.
SA trip analysis
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Simple image classification with Inception.
Run image classification with Inception trained on ImageNet 2012 Challenge data
set.
This program creates a graph from a saved GraphDef protocol buffer,
and runs inference on an input JPEG image. It outputs human readable
strings of the top 5 predictions along with their probabilities.
Change the --image_file argument to any jpg image to compute a
classification of that image.
Please see the tutorial and website for a detailed description of how
to use this script to perform image recognition.
https://tensorflow.org/tutorials/image_recognition/
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import re
import sys
import tarfile
import numpy as np
from six.moves import urllib
import tensorflow as tf
# pylint: disable=line-too-long
# Location of the gzipped model tarball that maybe_download_and_extract()
# fetches and unpacks.
DATA_URL = 'http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz'
# pylint: enable=line-too-long
# Directory in which the functions of this module look for the graph
# definition and the two label-map files.
model_dir = '/tmp/imagenet'
class NodeLookup(object):
    """Converts integer node ID's to human readable labels."""

    def __init__(self,
                 label_lookup_path=None,
                 uid_lookup_path=None):
        """Builds the node ID -> label table.

        Args:
          label_lookup_path: file mapping string UID to integer node ID;
            defaults to the label-map proto file inside `model_dir`.
          uid_lookup_path: file mapping string UID to human-readable string;
            defaults to the synset file inside `model_dir`.
        """
        if not label_lookup_path:
            label_lookup_path = os.path.join(
                model_dir, 'imagenet_2012_challenge_label_map_proto.pbtxt')
        if not uid_lookup_path:
            uid_lookup_path = os.path.join(
                model_dir, 'imagenet_synset_to_human_label_map.txt')
        self.node_lookup = self.load(label_lookup_path, uid_lookup_path)

    @staticmethod
    def _parse_uid_to_human(lines):
        """Parses the lines of the UID file into a dict of UID -> label text."""
        uid_to_human = {}
        p = re.compile(r'[n\d]*[ \S,]*')
        for line in lines:
            # findall() yields the UID first, an empty match at the
            # separator, and the label text third.
            parsed_items = p.findall(line)
            uid_to_human[parsed_items[0]] = parsed_items[2]
        return uid_to_human

    @staticmethod
    def _parse_node_id_to_uid(lines):
        """Parses the lines of the label-map file into a dict of node ID -> UID."""
        node_id_to_uid = {}
        target_class = None
        for line in lines:
            # Match on the stripped line so the result does not depend on how
            # deeply the entries are indented in the file.
            stripped = line.strip()
            if stripped.startswith('target_class:'):
                target_class = int(stripped.split(': ')[1])
            elif stripped.startswith('target_class_string:'):
                if target_class is None:
                    # A UID with no preceding node ID cannot be keyed.
                    continue
                node_id_to_uid[target_class] = (
                    stripped.split(': ')[1].strip().strip('"'))
        return node_id_to_uid

    def load(self, label_lookup_path, uid_lookup_path):
        """Loads a human readable English name for each softmax node.

        Args:
          label_lookup_path: string UID to integer node ID.
          uid_lookup_path: string UID to human-readable string.

        Returns:
          dict from integer node ID to human-readable string.
        """
        if not tf.gfile.Exists(uid_lookup_path):
            tf.logging.fatal('File does not exist %s', uid_lookup_path)
        if not tf.gfile.Exists(label_lookup_path):
            tf.logging.fatal('File does not exist %s', label_lookup_path)

        # Loads mapping from string UID to human-readable string.
        # The with-blocks close both files once their lines are read.
        with tf.gfile.GFile(uid_lookup_path) as uid_file:
            uid_to_human = self._parse_uid_to_human(uid_file.readlines())

        # Loads mapping from string UID to integer node ID.
        with tf.gfile.GFile(label_lookup_path) as label_file:
            node_id_to_uid = self._parse_node_id_to_uid(label_file.readlines())

        # Loads the final mapping of integer node ID to human-readable string
        node_id_to_name = {}
        for key, val in node_id_to_uid.items():
            if val not in uid_to_human:
                tf.logging.fatal('Failed to locate: %s', val)
            node_id_to_name[key] = uid_to_human[val]
        return node_id_to_name

    def id_to_string(self, node_id):
        """Returns the label for `node_id`, or '' when the ID is unknown."""
        return self.node_lookup.get(node_id, '')
def create_graph():
    """Imports the saved GraphDef file into the TensorFlow graph.

    Reads 'classify_image_graph_def.pb' from `model_dir`, parses it into a
    GraphDef and imports it with an empty name prefix. Returns nothing.
    """
    graph_path = os.path.join(model_dir, 'classify_image_graph_def.pb')
    # GFile is what the rest of this module uses for file access.
    with tf.gfile.GFile(graph_path, 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
    # name='' keeps the tensor names exactly as stored in the file.
    tf.import_graph_def(graph_def, name='')
def run_inference_on_image(image, num_top_predictions):
    """Runs inference on an image and prints the best-scoring labels.

    Args:
      image: Image file name.
      num_top_predictions: How many of the highest-scoring labels to report.

    Returns:
      List of (human_string, score) tuples ordered from highest to lowest
      score; each entry is also printed.
    """
    if not tf.gfile.Exists(image):
        tf.logging.fatal('File does not exist %s', image)
    # Read through a with-block so the image file handle is closed.
    with tf.gfile.GFile(image, 'rb') as image_file:
        image_data = image_file.read()

    with tf.Session() as sess:
        # Some useful tensors:
        # 'softmax:0': A tensor containing the normalized prediction across
        #   1000 labels.
        # 'pool_3:0': A tensor containing the next-to-last layer containing 2048
        #   float description of the image.
        # 'DecodeJpeg/contents:0': A tensor containing a string providing JPEG
        #   encoding of the image.
        # Runs the softmax tensor by feeding the image_data as input to the graph.
        softmax_tensor = sess.graph.get_tensor_by_name('softmax:0')
        predictions = sess.run(softmax_tensor,
                               {'DecodeJpeg/contents:0': image_data})
    predictions = np.squeeze(predictions)

    # Creates node ID --> English string lookup.
    node_lookup = NodeLookup()

    # Reverse first, then take the head: same order as slicing the tail and
    # reversing it, but a request for 0 predictions yields an empty list
    # (a [-0:] tail slice would select every class).
    top_k = predictions.argsort()[::-1][:num_top_predictions]
    result = []
    for node_id in top_k:
        human_string = node_lookup.id_to_string(node_id)
        score = predictions[node_id]
        result.append((human_string, score))
        print('%s (score = %.5f)' % (human_string, score))
    return result
def maybe_download_and_extract(dest_directory):
    """Download and extract model tar file.

    The archive named by DATA_URL is downloaded into `dest_directory` unless
    a file with that name is already there, and is then unpacked into the
    same directory.

    Args:
      dest_directory: Directory that receives the archive and its contents;
        created when it does not exist.
    """
    if not os.path.exists(dest_directory):
        os.makedirs(dest_directory)
    filename = DATA_URL.split('/')[-1]
    filepath = os.path.join(dest_directory, filename)
    if not os.path.exists(filepath):
        def _progress(count, block_size, total_size):
            # Skip the percentage when the size is zero or unknown so the
            # callback cannot divide by zero.
            if total_size <= 0:
                return
            sys.stdout.write('\r>> Downloading %s %.1f%%' % (
                filename, float(count * block_size) / float(total_size) * 100.0))
            sys.stdout.flush()
        filepath, _ = urllib.request.urlretrieve(DATA_URL, filepath, _progress)
        print()
        statinfo = os.stat(filepath)
        print('Successfully downloaded', filename, statinfo.st_size, 'bytes.')
    # NOTE(review): extractall() trusts the member paths of an archive that
    # was fetched over plain HTTP; consider validating members before
    # extraction.
    # The with-block closes the archive once extraction is done.
    with tarfile.open(filepath, 'r:gz') as archive:
        archive.extractall(dest_directory)
def init():
    """Prepares the classifier: fetches the model files, then loads the graph."""
    # The graph definition is read from model_dir, which is only populated
    # once the archive has been downloaded and unpacked, so the download has
    # to come first.
    maybe_download_and_extract(model_dir)
    # Creates graph from saved GraphDef.
    create_graph()
def main(_):
    """Entry point handed to tf.app.run: classifies the --image_file image.

    Args:
      _: Remaining command-line arguments passed on by tf.app.run (unused).
    """
    global model_dir
    # The functions of this module read the module-level model_dir, so the
    # --model_dir flag only takes effect once it is copied there.
    model_dir = FLAGS.model_dir
    # Download before building the graph: the graph file is read from
    # model_dir, which the download step fills.
    maybe_download_and_extract(model_dir)
    create_graph()
    # NOTE(review): the fallback assumes the model archive ships a sample
    # 'cropped_panda.jpg' -- confirm against the archive contents.
    image = FLAGS.image_file or os.path.join(model_dir, 'cropped_panda.jpg')
    run_inference_on_image(image, FLAGS.num_top_predictions)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # classify_image_graph_def.pb:
    #   Binary representation of the GraphDef protocol buffer.
    # imagenet_synset_to_human_label_map.txt:
    #   Map from synset ID to a human readable string.
    # imagenet_2012_challenge_label_map_proto.pbtxt:
    #   Text representation of a protocol buffer mapping a label to synset ID.
    parser.add_argument(
        '--model_dir',
        type=str,
        default='/tmp/imagenet',
        help="""\
Path to classify_image_graph_def.pb,
imagenet_synset_to_human_label_map.txt, and
imagenet_2012_challenge_label_map_proto.pbtxt.\
"""
    )
    parser.add_argument(
        '--image_file',
        type=str,
        default='',
        help='Absolute path to image file.'
    )
    parser.add_argument(
        '--num_top_predictions',
        type=int,
        default=5,
        help='Display this many predictions.'
    )
    # Flags argparse does not recognise are forwarded to tf.app.run.
    FLAGS, unparsed = parser.parse_known_args()
    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
from datetime import datetime, timedelta
import glob
import json
import re
import matplotlib.pyplot as plt
import matplotlib.ticker as tkr
import tweepy
import PIL.Image
import PIL.ExifTags
import pandas as pd
from . import classify_image
# Display settings for printing data frames.
# NOTE(review): -1 as "no limit" for max_colwidth is version-dependent;
# newer pandas releases expect None here -- confirm the target version.
pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)
# Twitter API credentials used by get_tweets(); left blank here and must be
# filled in before the script can reach the API.
TWITTER_CONSUMER_KEY = ''
TWITTER_CONSUMER_SECRET = ''
TWITTER_ACCESS_TOKEN = ''
TWITTER_ACCESS_TOKEN_SECRET = ''
# Account whose timeline get_tweets() reads.
USER_ID = '21653573'
# Separator on which tweet text is split: the part before the first marker is
# taken as the start and the part after the last marker as the finish.
MARKER = '✈'
def get_tweets():
    """Return the items of a tweepy cursor over the timeline of USER_ID.

    Authenticates with the module-level Twitter credentials and requests the
    timeline 200 entries per page.
    """
    credentials = tweepy.OAuthHandler(TWITTER_CONSUMER_KEY,
                                      TWITTER_CONSUMER_SECRET)
    credentials.set_access_token(TWITTER_ACCESS_TOKEN,
                                 TWITTER_ACCESS_TOKEN_SECRET)
    client = tweepy.API(credentials)

    # Query parameters forwarded unchanged to the timeline endpoint.
    timeline_query = {
        'user_id': USER_ID,
        'exclude_replies': 'false',
        'include_rts': 'false',
        'count': 200,
    }
    return tweepy.Cursor(client.user_timeline, **timeline_query).items()
# Get tweets about flights: collect the timeline, keep the date window, keep
# only tweets written entirely in upper case, then split them on MARKER.
tweet_rows = [(tweet.text, tweet.created_at) for tweet in get_tweets()]
all_tweets = pd.DataFrame(tweet_rows, columns=['text', 'created_at'])

# Both window bounds are exclusive.
window_start = datetime(2018, 9, 8)
window_end = datetime(2018, 9, 30)
in_window = ((all_tweets.created_at > window_start)
             & (all_tweets.created_at < window_end))
tweets_in_dates = all_tweets[in_window]

# A tweet qualifies when upper-casing leaves its text unchanged.
is_upper_case = tweets_in_dates.text.str.upper() == tweets_in_dates.text
flights_tweets = tweets_in_dates[is_upper_case]

# First piece of the split is the start, last piece is the finish.
flights_tweets = flights_tweets.assign(
    start=lambda df: df.text.str.split(MARKER).str[0])
flights_tweets = flights_tweets.assign(
    finish=lambda df: df.text.str.split(MARKER).str[-1])

flights = flights_tweets[['start', 'finish', 'created_at']].sort_values(
    'created_at')
def get_iata_to_city(path='airports.json'):
    """Build a mapping from IATA airport code to city name.

    Args:
      path: JSON file containing an object whose values are airport records
        with 'iata' and 'city' fields. Defaults to 'airports.json' in the
        current directory.

    Returns:
      dict from IATA code to city for every record with a non-empty 'iata'.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # locale's default encoding.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)
    # Records with a missing or empty code are left out.
    return {airport['iata']: airport['city']
            for airport in data.values()
            if airport.get('iata')}
iata_to_city = get_iata_to_city()
# Override the city name stored for this one code.
iata_to_city['EZE'] = 'Buenos-Aires'


def _code_to_city(code):
    """Look up the city for a code after dropping its non-word characters."""
    return iata_to_city[re.sub(r'\W+', '', code)]


# Replace the codes in both columns with city names.
flights = flights.assign(
    start=flights.start.apply(_code_to_city),
    finish=flights.finish.apply(_code_to_city))

# Each row is paired with the timestamp of the row before it: that earlier
# timestamp is recorded as the arrival, and the gap between the two as the
# time spent in the row's start city.
previous_created_at = flights.created_at.shift(1)
stays = flights.assign(
    spent=flights.created_at - previous_created_at,
    city=flights.start,
    arrived=previous_created_at,
)
cities = stays[["city", "spent", "arrived"]]

# Keep only rows whose gap has a positive whole-day component.
has_whole_day = cities.spent.dt.days > 0
cities = cities.assign(left=cities.arrived + cities.spent)[has_whole_day]


def _format_spent(x, _):
    """Render an axis value as a timedelta string."""
    # presumably the plotted timedeltas arrive as nanoseconds, hence the
    # division before building the timedelta -- TODO confirm
    return str(timedelta(seconds=x / 1000000000))


formatter = tkr.FuncFormatter(_format_spent)
cities_axes = cities.plot(kind="bar", x="city", y="spent",
                          title='Cities', legend=False)
cities_axes.yaxis.set_major_formatter(formatter)
plt.tight_layout()
def read_photos():
    """Yield (file name, capture time) for the JPEG files under photos/.

    The capture time is parsed from the EXIF 'DateTime' tag. Photos without
    EXIF data or without that tag are skipped.

    Yields:
      Tuples of the file name and a datetime.
    """
    for name in glob.glob('photos/*.jpg'):
        # The with-block closes each image file before the next one is
        # opened; _getexif() returns None for images without EXIF data.
        with PIL.Image.open(name) as img:
            raw_exif = img._getexif() or {}
        # Translate the numeric EXIF tag ids into their names.
        exif = {
            PIL.ExifTags.TAGS[k]: v
            for k, v in raw_exif.items()
            if k in PIL.ExifTags.TAGS
        }
        taken_at = exif.get('DateTime')
        if not taken_at:
            # No timestamp, so the photo cannot be placed in time.
            continue
        yield name, datetime.strptime(taken_at, '%Y:%m:%d %H:%M:%S')
photo_rows = list(read_photos())
raw_photos = pd.DataFrame(photo_rows, columns=['name', 'created_at'])

# Giving both frames the same constant key makes the merge pair every photo
# with every city stay.
keyed_photos = raw_photos.assign(key=0)
keyed_cities = cities.assign(key=0)
photos_cities = keyed_photos.merge(keyed_cities, how='outer')

# Keep the pairings where the photo falls inside the stay (bounds inclusive).
not_before_arrival = photos_cities.created_at >= photos_cities.arrived
not_after_leaving = photos_cities.created_at <= photos_cities.left
photos = photos_cities[not_before_arrival & not_after_leaving]

# Count the photos of each city.
name_counts = photos.groupby(by='city').agg({'name': 'count'})
photos_by_city = name_counts.rename(columns={'name': 'photos'}).reset_index()

photos_by_city.plot(kind="bar", x='city', y='photos',
                    legend=False, title='Photos by city')
plt.tight_layout()
classify_image.init()


def _best_prediction(name):
    """Return the first entry of the classifier result for one photo."""
    return classify_image.run_inference_on_image(name, 1)[0]


# One prediction per photo, expanded into two columns.
tags = photos.name.apply(_best_prediction).apply(pd.Series)
tags.columns = ['tag', 'score']

tagged_photos = photos.copy()
tagged_photos[['tag', 'score']] = tags.apply(pd.Series)


def _first_label(tag):
    """Keep only the text before the first ', ' of a tag."""
    return tag.split(', ')[0]


tagged_photos['tag'] = tagged_photos.tag.apply(_first_label)

# Count photos per tag and keep the ten most frequent tags.
tag_counts = tagged_photos[['tag', 'name']] \
    .groupby(by='tag') \
    .agg({'name': 'count'})
tag_counts = tag_counts.rename(columns={'name': 'photos'}).reset_index()
photos_by_tag = tag_counts.sort_values('photos', ascending=False).head(10)

photos_by_tag.plot(kind='bar', x='tag', y='photos',
                   title='Popular tags', legend=False)
plt.tight_layout()
# The five most frequent tags keep their own name; every other tag is
# relabelled 'other'.
popular_tags = photos_by_tag.head(5).tag
popular_tagged = tagged_photos[tagged_photos.tag.isin(popular_tags)]
not_popular_tagged = tagged_photos[~tagged_photos.tag.isin(popular_tags)].assign(
    tag='other')
# pd.concat stacks the two frames exactly as DataFrame.append did, and it is
# available in pandas releases where DataFrame.append no longer exists.
by_tag_city = pd.concat([popular_tagged, not_popular_tagged]) \
    .groupby(by=['city', 'tag']) \
    .count()['name'] \
    .unstack(fill_value=0)
# One stacked bar per city, one segment per tag.
by_tag_city.plot(kind='bar', stacked=True)
plt.tight_layout()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment