Skip to content

Instantly share code, notes, and snippets.

@neelaryan
Created July 22, 2016 07:54
Show Gist options
  • Save neelaryan/f9492791c4b3e6984024f81ed4445cc0 to your computer and use it in GitHub Desktop.
Save neelaryan/f9492791c4b3e6984024f81ed4445cc0 to your computer and use it in GitHub Desktop.
import os
import sys
import random
from tqdm import tqdm
from scipy import misc
import multiprocessing as mp
from collections import defaultdict
# Working directory at start-up; used below as the base for the image folder
# and the 'texts' folder paths.
PWD = os.getcwd()
print(PWD)
def get_image_paths(folder_name):
    """Collect the paths of the images that should be converted.

    ``folder_name`` is resolved against ``PWD`` and is scanned one level
    deep: every sub-directory is treated as a uid folder and its entries
    are candidate images.

    Args:
        folder_name: Folder (relative to ``PWD``) holding the uid folders.

    Returns:
        A list of paths of the form ``PWD/folder_name/<uid>/<file>`` whose
        last three characters are ``jpg`` and whose fifth character from
        the end is ``l``.
    """
    print('Getting raw image paths')
    root = os.path.join(PWD, folder_name)
    image_files = []
    for uid in os.listdir(root):
        uid_dir = os.path.join(root, uid)
        # A stray regular file next to the uid folders would make
        # os.listdir() raise NotADirectoryError, so only descend into
        # real directories.
        if not os.path.isdir(uid_dir):
            continue
        image_files.extend(os.path.join(uid_dir, fl) for fl in os.listdir(uid_dir))
    # Same two conditions as before, applied in a single pass; the slice
    # [-5:-4] cannot raise even on a very short string.
    return [p for p in image_files if p[-3:] == 'jpg' and p[-5:-4] == 'l']
def process_image(image_path):
    """Convert one image into a text file of normalised pixel values.

    The image is read as greyscale, resized to 25x25, flattened and divided
    by 255.0; the values are written space-separated on a single line to
    ``PWD/texts/<uid>_<basename>``, where ``<uid>`` is the name of the
    folder containing the image. Nothing is done when that output file
    already exists, so an interrupted run can be resumed.

    Args:
        image_path: Path of the image, expected as ``.../<uid>/<file>``.
    """
    parent_dir, basename = os.path.split(image_path)
    _, uid = os.path.split(parent_dir)
    texts_dir = os.path.join(PWD, 'texts')
    path_to_write = os.path.join(texts_dir, uid + '_' + basename)
    if os.path.exists(path_to_write):
        return

    # The output folder is not created anywhere else in this script;
    # exist_ok keeps this safe when several pool workers get here at once.
    os.makedirs(texts_dir, exist_ok=True)

    # NOTE(review): misc.imread / misc.imresize were deprecated in SciPy 1.0
    # and are absent from current SciPy releases; this needs an old SciPy
    # (with Pillow) or a port to another image library.
    image = misc.imread(image_path, flatten=True)
    image = misc.imresize(image, (25, 25), mode='L')
    pixels = image.flatten() / 255.0
    with open(path_to_write, 'w') as fl:
        fl.write(' '.join(map(str, pixels)))
# ------------------------------------ convert every image to a text file
# The images folder is the only command-line argument; without it, show the
# usage text and stop with exit status 1.
try:
    folder_name = sys.argv[1]
except IndexError:
    print('''
Usage:
python image_to_csv.py [images_folder_name]
Generates:
---------
- datafile: csv of image pixel values
- infofile: name of image at given index
- train: file containing subset of datafile for training
- train.info: file containing names of sample_som for training
- test: file containing subset of datafile for testing
- test.info: file containing names of sample_som for testing
''')
    sys.exit(1)

print(folder_name)
image_paths = get_image_paths(folder_name)
print('Total {} image paths collected'.format(len(image_paths)))
# Sanity output: the distinct extensions and size letters that were collected.
print({i[-3:] for i in image_paths}, {i[-5] for i in image_paths}, 'images collected')

# process_image writes into PWD/texts; create it once before fanning out so
# the workers do not fail on a missing folder.
os.makedirs(os.path.join(PWD, 'texts'), exist_ok=True)

print('Giving to pool')
pool = mp.Pool(mp.cpu_count())
try:
    pool.map(process_image, image_paths)
finally:
    # Always shut the workers down, even when a conversion raised.
    pool.close()
    pool.join()
print('Complete')
# - ----------------------------make one data file
# Concatenate every per-image text file into data/datafile (one image per
# line) and, in the same pass, send each line to data/train with probability
# 0.8 or to data/test otherwise. Each *.info / info_file line holds the
# "uid,image name" of the data line with the same index.
print('Making 80% split for train test')
if not os.path.exists('data'):
    # 'data' is known not to exist in this branch, so the former
    # `rm -rf data` was a no-op; create the folder without a shell.
    os.makedirs('data')
    print('Making one data file')
    texts_dir = os.path.join(PWD, 'texts')
    all_files = sorted(os.listdir(texts_dir))
    with open('data/datafile', 'w') as datafile, \
            open('data/info_file', 'w') as info_file, \
            open('data/train', 'w') as train, \
            open('data/train.info', 'w') as train_info, \
            open('data/test', 'w') as test, \
            open('data/test.info', 'w') as test_info:
        for line in tqdm(all_files, ncols=60):
            # File names are '<uid>_<image name>'. Split on the first
            # underscore only, so an underscore inside the image name does
            # not break the unpacking (assumes uids contain no underscore).
            uid, img = line.split('_', 1)
            with open(os.path.join(texts_dir, line), 'r') as fl:
                string = '{}\n'.format(fl.read())
            info = '{},{}\n'.format(uid, img)
            datafile.write(string)
            info_file.write(info)
            if random.random() < 0.8:
                da, inf = train, train_info
            else:
                da, inf = test, test_info
            da.write(string)
            inf.write(info)
else:
    print('Datafiles exist')
# ------------------------------------calling somoclu
# Train the top-level 15x15 map on data/train unless its output already
# exists from an earlier run.
if not os.path.exists('results/sample.bm'):
    # The outputs are written under the 'results/sample' prefix and nothing
    # else in this script creates that folder.
    os.makedirs('results', exist_ok=True)
    # OMP_NUM_THREADS is the variable the OpenMP runtime reads; the previous
    # spelling (OMP_NUMBER_THREADS) had no effect.
    command = 'OMP_NUM_THREADS=32 somoclu -x 15 -y 15 data/train results/sample'
    print(command)
    # Stop here with a clear message instead of failing later when
    # results/sample.bm is opened.
    if os.system(command) != 0:
        sys.exit('somoclu failed: {}'.format(command))
else:
    print('somoclu has already run')
# ------------------------------------ getting subsets of the files
# Read the (index, x, y) assignment of every training row and write the rows
# of each (x, y) cell to their own file submaps/data/<x>_<y>.
print('Reading training classifications')
with open('results/sample.bm', 'r') as classification:
    # The first two lines are skipped, as before. split() without an
    # argument tolerates repeated spaces and drops the trailing newline;
    # blank lines are ignored instead of breaking the unpacking below.
    classes = [i.split() for i in classification.readlines()[2:] if i.strip()]
if not os.path.exists('submaps'):
    os.makedirs('submaps/data')
    os.makedirs('submaps/maps')
    with open('data/train', 'r') as trainfile:
        train = trainfile.readlines()
    # Group the rows per cell first so every output file is opened once
    # rather than once per training row.
    rows_by_cell = defaultdict(list)
    for index, x, y in tqdm(classes, ncols=60):
        name = 'submaps/data/{}_{}'.format(x.strip(), y.strip())
        rows_by_cell[name].append(train[int(index)])
    for name, rows in rows_by_cell.items():
        with open(name, 'a') as fl:
            fl.writelines(rows)
# ----------------------------------------running maps for the new subgroups
def generate_maps(command):
    """Run one somoclu shell command; used as the pool worker function."""
    os.system(command)


# One 5x5 sub-map is trained per (x, y) cell found in `classes`.
print('Generating commands for somoclu')
folders = set()
for _, x, y in tqdm(classes, ncols=60):
    folders.add('{}_{}'.format(x.strip(), y.strip()))

# Decide what still has to run by looking for each cell's map.bm output.
# Testing for the 'submaps/maps' folder is not enough: the data-splitting
# step above creates that folder, so the maps would never be built.
pending = sorted(
    name for name in folders
    if not os.path.exists('submaps/maps/{}/map.bm'.format(name))
)
# OMP_NUM_THREADS is the variable the OpenMP runtime reads; the previous
# spelling (OMP_NUMBER_THREADS) had no effect.
# NOTE(review): cpu_count() commands run at once, each allowed 32 threads;
# confirm that level of oversubscription is intended.
commands = {
    'OMP_NUM_THREADS=32 somoclu -x 5 -y 5 submaps/data/{} submaps/maps/{}/map > /dev/null'.format(name, name)
    for name in pending
}
if pending:
    print('Creating needed folders')
    for name in tqdm(pending, ncols=60):
        os.makedirs('submaps/maps/{}'.format(name), exist_ok=True)
    print('Giving to pool')
    pool = mp.Pool(mp.cpu_count())
    try:
        pool.map(generate_maps, list(commands))
    finally:
        # Always shut the workers down, even when a command raised.
        pool.close()
        pool.join()
    print('Done generating maps')
else:
    print('Maps are already created')
# ------------------------------------------get average classification per cell
# For every sub-map, take the number of lines in its map.bm divided by 35.0,
# then report the mean of those values.
densities = []
for name in tqdm(folders, ncols=60):
    with open('submaps/maps/{}/map.bm'.format(name), 'r') as fl:
        # NOTE(review): the divisor 35.0 is kept from the original. The
        # sub-maps are trained with -x 5 -y 5 and the top-level .bm reader
        # skips two leading lines, so confirm this constant is intended.
        densities.append(sum(1 for _ in fl) / 35.0)
if densities:
    print('Average image per cell:', sum(densities) / len(densities))
else:
    # No sub-maps at all: avoid dividing by zero.
    print('Average image per cell: no maps found')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment