Created
July 22, 2016 07:54
-
-
Save neelaryan/f9492791c4b3e6984024f81ed4445cc0 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import sys | |
import random | |
from tqdm import tqdm | |
from scipy import misc | |
import multiprocessing as mp | |
from collections import defaultdict | |
# Working directory that anchors every path this script reads or writes.
PWD = os.getcwd()
print(PWD)
def get_image_paths(folder_name):
    """Return paths of '*l.jpg' images found one level under PWD/folder_name.

    Each subfolder of folder_name is treated as one uid; every file inside
    it is collected, then filtered down to .jpg files whose name ends in
    'l' just before the extension.
    """
    print('Getting raw image paths')
    base = os.path.join(PWD, folder_name)
    candidates = [
        os.path.join(base, uid, fname)
        for uid in os.listdir(base)
        for fname in os.listdir(os.path.join(base, uid))
    ]
    # Keep only jpg files; of those, keep the ones ending '...l.jpg'.
    return [p for p in candidates if p[-3:] == 'jpg' and p[-5] == 'l']
def process_image(image_path):
    """Convert one image into a normalised-pixel text file under PWD/texts/.

    The image is read as grayscale, resized to 25x25, flattened, scaled to
    [0, 1] and written as space-separated floats. The output file is named
    '<uid>_<basename>' (uid = parent folder name) so images from different
    subfolders cannot collide. Existing outputs are skipped, making the
    function idempotent and safe to re-run.
    """
    tail, basename = os.path.split(image_path)
    _, uid = os.path.split(tail)
    out_dir = os.path.join(PWD, 'texts')
    # Bug fix: nothing else in the script creates 'texts', so the very
    # first run crashed on open(..., 'w'). Create it here; the try/except
    # tolerates the race between pool workers doing the same thing.
    if not os.path.exists(out_dir):
        try:
            os.makedirs(out_dir)
        except OSError:
            pass  # another worker created it first
    path_to_write = os.path.join(out_dir, uid + '_' + basename)
    if os.path.exists(path_to_write):
        return  # already converted on a previous run
    # NOTE(review): scipy.misc.imread/imresize were removed in SciPy >= 1.2;
    # migrate to imageio.imread + an explicit resize when upgrading SciPy.
    image = misc.imread(image_path, flatten=True)     # grayscale array
    image = misc.imresize(image, (25, 25), mode='L')  # uint8, 25x25
    image = image.flatten()
    image = image / 255.0                             # normalise to [0, 1]
    string = ' '.join(map(str, image))
    with open(path_to_write, 'w') as fl:
        fl.write(string)
# --- command-line handling and parallel conversion -----------------------
# The single positional argument names the folder of raw images; without it
# the script prints usage and aborts.
try:
    folder_name = sys.argv[1]
except IndexError:
    print('''
Usage:
python image_to_csv.py [images_folder_name]
Generates:
---------
- datafile: csv of image pixel values
- infofile: name of image at given index
- train: file containing subset of datafile for training
- train.info: file containing names of sample_som for training
- test: file containing subset of datafile for testing
- test.info: file containing names of sample_som for testing
''')
    sys.exit(1)
else:
    print(folder_name)
image_paths = get_image_paths(folder_name)
print('Total {} image paths collected'.format(len(image_paths)))
# Sanity check: both sets should each contain exactly one element
# ({'jpg'} and {'l'}) if the filtering in get_image_paths worked.
print(set((i[-3:] for i in image_paths)), set((i[-5] for i in image_paths)), 'images collected')
print('Giving to pool')
# Fan the per-image conversion out over one worker per CPU core.
pool = mp.Pool(mp.cpu_count())
pool.map(process_image, image_paths)
pool.close()
pool.join()
print('Complete')
# - ----------------------------make one data file
# Concatenate every per-image text file into one datafile (plus an info
# file mapping row index -> uid,image) and randomly split rows ~80/20 into
# train/test. Guarded on the 'data' directory so a re-run does not rebuild.
print('Making 80% split for train test')
if not os.path.exists('data'):
    # The original also ran 'rm -rf data' here -- a no-op, since this
    # branch only runs when the path does not exist. Dropped.
    os.system('mkdir data')
    print('Making one data file')
    with open('data/datafile', 'w') as datafile, \
         open('data/info_file', 'w') as info_file, \
         open('data/train', 'w') as train, \
         open('data/train.info', 'w') as train_info, \
         open('data/test', 'w') as test, \
         open('data/test.info', 'w') as test_info:
        all_files = os.listdir(os.path.join(PWD, 'texts'))
        all_files.sort()  # deterministic order -> stable index->name mapping
        for line in tqdm(all_files, ncols=60):
            line_path = os.path.join(PWD, 'texts', line)
            with open(line_path, 'r') as fl:
                # Bug fix: filenames are '<uid>_<basename>' and basenames
                # may themselves contain '_'; split only on the first one
                # (the original bare split('_') raised ValueError there).
                uid, img = line.split('_', 1)
                string = '{}\n'.format(fl.read())
                datafile.write(string)
                info_file.write('{},{}\n'.format(uid, img))
                # ~80/20 random assignment to train or test.
                if random.random() < 0.8:
                    da, inf = train, train_info
                else:
                    da, inf = test, test_info
                da.write(string)
                inf.write('{},{}\n'.format(uid, img))
else:
    print('Datafiles exist')
# ------------------------------------calling somoclu
# Train the top-level 15x15 SOM on the training split; skipped when the
# best-matching-unit output from a previous run already exists.
if not os.path.exists('results/sample.bm'):
    # Bug fix: OpenMP reads OMP_NUM_THREADS; the original misspelling
    # 'OMP_NUMBER_THREADS' was silently ignored, leaving the thread count
    # at its default.
    command = 'OMP_NUM_THREADS=32 somoclu -x 15 -y 15 data/train results/sample'
    print(command)
    os.system(command)
else:
    print('somoclu has already run')
# ------------------------------------ getting subsets of the files
print('Reading training classifications')
# sample.bm is somoclu's best-matching-unit output: two header lines, then
# one "<train-row-index> <x> <y>" line per training sample.
with open('results/sample.bm', 'r') as classification:
    classes = [i.split(' ') for i in classification.readlines()[2:]]
if not os.path.exists('submaps'):
    os.system('mkdir submaps')
    os.system('mkdir submaps/data')
    os.system('mkdir submaps/maps')
    with open('data/train', 'r') as trainfile:
        train = trainfile.readlines()
    # Route each training row into a per-cell file named '<x>_<y>' after
    # its best-matching unit. Files are opened in append mode, so this must
    # run at most once -- the 'submaps' existence check above is the only
    # guard against duplicated rows.
    # NOTE(review): nesting reconstructed from a whitespace-stripped paste;
    # confirm this loop was intended to sit inside the guard.
    for index, x, y in tqdm(classes, ncols=60):
        index, name = int(index), 'submaps/data/{}_{}'.format(x.strip(), y.strip())
        with open(name, 'a') as fl:
            fl.write(train[index])
# ----------------------------------------running maps for the new subgroups
# Build one somoclu command per occupied top-level cell, then run them in
# parallel to train a 5x5 sub-SOM for each cell's data file.
commands, folders = set(), set()
print('Generating commands for somoclu')
for index, x, y in tqdm(classes, ncols=60):
    index, name = int(index), '{}_{}'.format(x.strip(), y.strip())
    folders.add(name)
    # Bug fix: OpenMP reads OMP_NUM_THREADS; the original misspelling
    # 'OMP_NUMBER_THREADS' had no effect on somoclu's thread count.
    command = 'OMP_NUM_THREADS=32 somoclu -x 5 -y 5 submaps/data/{} submaps/maps/{}/map > /dev/null'.format(name, name)
    commands.add(command)
if not os.path.exists('submaps/maps'):
    # NOTE(review): the earlier setup step already creates 'submaps/maps',
    # so on a fresh run this branch appears dead and the sub-maps are never
    # generated; the guard probably wants to test for a generated artifact
    # (e.g. a map.bm file) instead -- confirm intended behavior.
    os.system('mkdir submaps')
    os.system('mkdir submaps/maps')

    def generate_maps(command):
        """Run one somoclu shell command in a pool worker."""
        os.system(command)

    print('Creating needed folders')
    for name in tqdm(folders, ncols=60):
        os.system('mkdir submaps/maps/{}/ 2> /dev/null'.format(name))
    print('Giving to pool')
    pool = mp.Pool(mp.cpu_count())
    pool.map(generate_maps, list(commands))
    pool.close()
    pool.join()
    print('Done generating maps')
else:
    print('Maps are already created')
# ------------------------------------------get average classification per cell
# For each sub-map, count the classified rows in its map.bm and convert to
# an images-per-cell density, then report the overall average.
# NOTE(review): the 35.0 divisor does not match the 5x5 sub-map size (25
# cells) -- presumably a cell-count constant; confirm against the somoclu
# output format. Also assumes 'folders' is non-empty (else ZeroDivisionError).
densities = []
for name in tqdm(folders, ncols=60):
    bm_path = 'submaps/maps/{}/map.bm'.format(name)
    with open(bm_path, 'r') as fl:
        densities.append(len(fl.readlines()) / 35.0)
print('Average image per cell:', sum(densities) / len(densities))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment