# neelaryan/script.py

## script.py
import os
import sys
import random
from tqdm import tqdm
from scipy import misc
import multiprocessing as mp
from collections import defaultdict


# Working directory captured once at start-up; get_image_paths(),
# process_image() and the data-file step build their absolute paths from it.
PWD = os.getcwd()
print(PWD)  # echo the base directory so the run log shows where files are read/written


def get_image_paths(folder_name):
    """Collect the image files to convert from ``PWD/folder_name``.

    The folder is scanned one level deep: every sub-directory (one per uid)
    is listed and a file is kept when its path ends in ``jpg`` and its fifth
    character from the end is ``l`` (for a name such as ``abcl.jpg`` that is
    the character right before ``.jpg``).

    Args:
        folder_name: Directory, relative to ``PWD``, holding the per-uid
            sub-directories.

    Returns:
        list: Full paths of the selected images, in ``os.listdir`` order.
    """
    print('Getting raw image paths')
    root = os.path.join(PWD, folder_name)
    image_files = []
    for uid in os.listdir(root):
        uid_dir = os.path.join(root, uid)
        # A stray file next to the uid folders (e.g. .DS_Store) would make
        # os.listdir() raise NotADirectoryError, so only descend into
        # real directories.
        if not os.path.isdir(uid_dir):
            continue
        for file_name in os.listdir(uid_dir):
            path = os.path.join(uid_dir, file_name)
            if path.endswith('jpg') and path[-5] == 'l':
                image_files.append(path)
    return image_files


def process_image(image_path):
    """Convert one image into a text file of scaled grey-scale pixel values.

    The image is read as a single grey-scale layer, resized to 25x25,
    flattened and divided by 255.0.  The values are written, separated by
    single spaces, to ``PWD/texts/<uid>_<basename>`` where ``<uid>`` is the
    name of the image's parent directory.  If that file already exists the
    image is skipped, so re-running does not redo finished conversions.

    Args:
        image_path: Path of the image file to convert.
    """
    parent_dir, basename = os.path.split(image_path)
    uid = os.path.basename(parent_dir)
    out_dir = os.path.join(PWD, 'texts')
    path_to_write = os.path.join(out_dir, uid + '_' + basename)
    if os.path.exists(path_to_write):
        return

    # NOTE(review): scipy.misc.imread / imresize were deprecated in SciPy 1.0
    # and are absent from current releases; this needs an old SciPy (with
    # Pillow) until the calls are ported to another image library.
    image = misc.imread(image_path, flatten=True)
    image = misc.imresize(image, (25, 25), mode='L')
    image = image.flatten() / 255.0
    string = ' '.join(map(str, image))

    # Nothing else creates the output directory, so the first write used to
    # fail with FileNotFoundError.  exist_ok keeps this safe when several
    # pool workers get here at the same time.
    os.makedirs(out_dir, exist_ok=True)
    with open(path_to_write, 'w') as fl:
        fl.write(string)


# The images folder is the single required command-line argument; without it
# the usage text is shown and the script exits with status 1.
if len(sys.argv) < 2:
    print('''
    Usage:
    python image_to_csv.py [images_folder_name]

    Generates:
    ---------
    - datafile: csv of image pixel values
    - infofile: name of image at given index
    - train: file containing subset of datafile for training
    - train.info: file containing names of sample_som for training
    - test: file containing subset of datafile for testing
    - test.info: file containing names of sample_som for testing
    ''')
    sys.exit(1)

folder_name = sys.argv[1]
print(folder_name)
image_paths = get_image_paths(folder_name)
print('Total {} image paths collected'.format(len(image_paths)))
# Sanity output: the distinct extensions and fifth-from-last characters found.
print(
    {path[-3:] for path in image_paths},
    {path[-5] for path in image_paths},
    'images collected',
)
# Convert the images in parallel, one worker process per CPU.
print('Giving to pool')
pool = mp.Pool(mp.cpu_count())
pool.map(process_image, image_paths)
pool.close()
pool.join()
print('Complete')


# - ----------------------------make one data file
# Concatenates every file in PWD/texts into data/datafile (one image per
# line) and assigns each line at random to data/train (~80%) or data/test.
# The matching *.info files record "uid,image" for the line at the same index.
# The whole step is skipped when a 'data' directory already exists.
print('Making 80% split for train test')
if not os.path.exists('data'):
    # List the inputs first: if 'texts' is missing we fail before 'data' is
    # created, otherwise an empty 'data' would make the next run skip this step.
    all_files = sorted(os.listdir(os.path.join(PWD, 'texts')))
    os.makedirs('data')
    print('Making one data file')
    with open('data/datafile', 'w') as datafile, \
            open('data/info_file', 'w') as info_file, \
            open('data/train', 'w') as train, \
            open('data/train.info', 'w') as train_info, \
            open('data/test', 'w') as test, \
            open('data/test.info', 'w') as test_info:
        for line in tqdm(all_files, ncols=60):
            # File names are "<uid>_<image name>".  Split at the first
            # underscore only, so image names that contain underscores no
            # longer raise "too many values to unpack".
            uid, img = line.split('_', 1)
            with open(os.path.join(PWD, 'texts', line), 'r') as fl:
                string = '{}\n'.format(fl.read())
            info = '{},{}\n'.format(uid, img)
            datafile.write(string)
            info_file.write(info)
            if random.random() < 0.8:
                da, inf = train, train_info
            else:
                da, inf = test, test_info
            da.write(string)
            inf.write(info)
else:
    print('Datafiles exist')


# ------------------------------------calling somoclu
# Train the 15x15 top-level map on data/train unless its best-matching-unit
# file (results/sample.bm) is already present.
if not os.path.exists('results/sample.bm'):
    # Nothing else in this script creates 'results', the directory the
    # output prefix points into.
    os.makedirs('results', exist_ok=True)
    # OpenMP reads OMP_NUM_THREADS; the former "OMP_NUMBER_THREADS" is not a
    # variable it recognises, so the thread count was silently ignored.
    command = 'OMP_NUM_THREADS=32 somoclu -x 15 -y 15 data/train results/sample'
    print(command)
    os.system(command)
else:
    print('somoclu has already run')
# ------------------------------------ getting subsets of the files
# Read the (index, x, y) assignment of every training row and, on the first
# run, write each row of data/train to submaps/data/<x>_<y>.
print('Reading training classifications')
with open('results/sample.bm', 'r') as classification:
    # The first two lines are skipped as header.  split() without an argument
    # tolerates repeated spaces and the trailing newline; blank lines are
    # dropped so they cannot break the 3-way unpacking below.
    classes = [i.split() for i in classification.readlines()[2:] if i.strip()]
if not os.path.exists('submaps'):
    # Only submaps/data is created here.  submaps/maps is left to the
    # map-generation step: creating it up front makes a directory-existence
    # check there report the maps as already built on the very first run.
    os.makedirs('submaps/data')
    with open('data/train', 'r') as trainfile:
        train = trainfile.readlines()
    # Group the rows per cell first so every output file is opened once
    # instead of once per training row.
    rows_by_cell = defaultdict(list)
    for index, x, y in tqdm(classes, ncols=60):
        rows_by_cell['{}_{}'.format(x, y)].append(train[int(index)])
    for cell, rows in rows_by_cell.items():
        with open('submaps/data/{}'.format(cell), 'w') as fl:
            fl.writelines(rows)
# ----------------------------------------running maps for the new subgroups
def generate_maps(command):
    """Run one somoclu shell command; used as the pool worker function."""
    os.system(command)


# `folders` holds every occupied cell name "<x>_<y>", `commands` the somoclu
# call that trains a 5x5 map on that cell's rows; `pending` is the subset
# whose output file does not exist yet.
commands, folders = set(), set()
pending = []
print('Generating commands for somoclu')
for index, x, y in tqdm(classes, ncols=60):
    name = '{}_{}'.format(x.strip(), y.strip())
    if name in folders:
        continue
    folders.add(name)
    # OMP_NUM_THREADS is the variable OpenMP reads (was "OMP_NUMBER_THREADS").
    command = 'OMP_NUM_THREADS=32 somoclu -x 5 -y 5 submaps/data/{} submaps/maps/{}/map > /dev/null'.format(name, name)
    commands.add(command)
    if not os.path.exists('submaps/maps/{}/map.bm'.format(name)):
        pending.append((name, command))


# Decide from the map.bm outputs rather than from the existence of
# submaps/maps: that directory may already exist without a single trained
# map in it, which previously skipped training altogether.
if pending:
    print('Creating needed folders')
    for name, _ in tqdm(pending, ncols=60):
        os.makedirs('submaps/maps/{}'.format(name), exist_ok=True)
    print('Giving to pool')
    pool = mp.Pool(mp.cpu_count())
    pool.map(generate_maps, [command for _, command in pending])
    pool.close()
    pool.join()
    print('Done generating maps')
else:
    print('Maps are already created')
# ------------------------------------------get average classification per cell
# For every sub-map, count the rows assigned to it and divide by its number
# of cells; then average that density over all sub-maps.
BM_HEADER_LINES = 2   # the .bm reader above also skips two leading lines
SUBMAP_CELLS = 5 * 5  # sub-maps are trained with "-x 5 -y 5"; the old divisor was 35
densities = []
for name in tqdm(folders, ncols=60):
    with open('submaps/maps/{}/map.bm'.format(name), 'r') as fl:
        assigned = sum(1 for row in fl if row.strip()) - BM_HEADER_LINES
    densities.append(assigned / float(SUBMAP_CELLS))
if densities:
    print('Average image per cell:', sum(densities) / len(densities))
else:
    # No occupied cells: avoid dividing by zero.
    print('Average image per cell:', 0.0)
	import os
	import sys
	import random
	from tqdm import tqdm
	from scipy import misc
	import multiprocessing as mp
	from collections import defaultdict


	# Working directory captured once at start-up; the path-building code
	# below joins its absolute paths onto it.
	PWD = os.getcwd()
	print(PWD)


	def get_image_paths(folder_name):
	    """Collect the image files to convert from ``PWD/folder_name``.

	    Every sub-directory (one per uid) is listed and a file is kept when
	    its path ends in ``jpg`` and its fifth character from the end is ``l``.

	    Args:
	        folder_name: Directory, relative to ``PWD``, holding the per-uid
	            sub-directories.

	    Returns:
	        list: Full paths of the selected images, in ``os.listdir`` order.
	    """
	    print('Getting raw image paths')
	    root = os.path.join(PWD, folder_name)
	    image_files = []
	    for uid in os.listdir(root):
	        uid_dir = os.path.join(root, uid)
	        # Skip stray files: os.listdir() on them raises NotADirectoryError.
	        if not os.path.isdir(uid_dir):
	            continue
	        for file_name in os.listdir(uid_dir):
	            path = os.path.join(uid_dir, file_name)
	            if path.endswith('jpg') and path[-5] == 'l':
	                image_files.append(path)
	    return image_files


	def process_image(image_path):
	    """Convert one image into a text file of scaled grey-scale pixel values.

	    The image is read as a single grey-scale layer, resized to 25x25,
	    flattened and divided by 255.0, then written space-separated to
	    ``PWD/texts/<uid>_<basename>`` (``<uid>`` is the parent directory
	    name).  Images whose output file already exists are skipped.

	    Args:
	        image_path: Path of the image file to convert.
	    """
	    parent_dir, basename = os.path.split(image_path)
	    uid = os.path.basename(parent_dir)
	    out_dir = os.path.join(PWD, 'texts')
	    path_to_write = os.path.join(out_dir, uid + '_' + basename)
	    if os.path.exists(path_to_write):
	        return

	    # NOTE(review): scipy.misc.imread / imresize were deprecated in SciPy
	    # 1.0 and are absent from current releases.
	    image = misc.imread(image_path, flatten=True)
	    image = misc.imresize(image, (25, 25), mode='L')
	    image = image.flatten() / 255.0
	    string = ' '.join(map(str, image))

	    # The output directory is not created anywhere else; exist_ok makes
	    # concurrent creation by several pool workers harmless.
	    os.makedirs(out_dir, exist_ok=True)
	    with open(path_to_write, 'w') as fl:
	        fl.write(string)



	# The images folder is the single required command-line argument; without
	# it the usage text is shown and the script exits with status 1.
	if len(sys.argv) < 2:
	    print('''
	Usage:
	python image_to_csv.py [images_folder_name]

	Generates:
	---------
	- datafile: csv of image pixel values
	- infofile: name of image at given index
	- train: file containing subset of datafile for training
	- train.info: file containing names of sample_som for training
	- test: file containing subset of datafile for testing
	- test.info: file containing names of sample_som for testing
	''')
	    sys.exit(1)

	folder_name = sys.argv[1]
	print(folder_name)
	image_paths = get_image_paths(folder_name)
	print('Total {} image paths collected'.format(len(image_paths)))
	# Sanity output: the distinct extensions and fifth-from-last characters found.
	print(set((i[-3:] for i in image_paths)), set((i[-5] for i in image_paths)), 'images collected')
	# Convert the images in parallel, one worker process per CPU.
	print('Giving to pool')
	pool = mp.Pool(mp.cpu_count())
	pool.map(process_image, image_paths)
	pool.close()
	pool.join()
	print('Complete')


	# - ----------------------------make one data file
	# Concatenates every file in PWD/texts into data/datafile and assigns each
	# line at random to data/train (~80%) or data/test; the *.info files
	# record "uid,image" per line.  Skipped when 'data' already exists.
	print('Making 80% split for train test')
	if not os.path.exists('data'):
	    # List the inputs before creating 'data' so a missing 'texts'
	    # directory does not leave an empty 'data' behind.
	    all_files = sorted(os.listdir(os.path.join(PWD, 'texts')))
	    os.makedirs('data')
	    print('Making one data file')
	    with open('data/datafile', 'w') as datafile, \
	            open('data/info_file', 'w') as info_file, \
	            open('data/train', 'w') as train, \
	            open('data/train.info', 'w') as train_info, \
	            open('data/test', 'w') as test, \
	            open('data/test.info', 'w') as test_info:
	        for line in tqdm(all_files, ncols=60):
	            # Split at the first underscore only, so image names that
	            # contain underscores do not break the unpacking.
	            uid, img = line.split('_', 1)
	            with open(os.path.join(PWD, 'texts', line), 'r') as fl:
	                string = '{}\n'.format(fl.read())
	            info = '{},{}\n'.format(uid, img)
	            datafile.write(string)
	            info_file.write(info)
	            if random.random() < 0.8:
	                da, inf = train, train_info
	            else:
	                da, inf = test, test_info
	            da.write(string)
	            inf.write(info)
	else:
	    print('Datafiles exist')


	# ------------------------------------calling somoclu
	# Train the 15x15 top-level map on data/train unless results/sample.bm
	# is already present.
	if not os.path.exists('results/sample.bm'):
	    # Nothing else in this script creates 'results'.
	    os.makedirs('results', exist_ok=True)
	    # OpenMP reads OMP_NUM_THREADS; "OMP_NUMBER_THREADS" is not a variable
	    # it recognises.
	    command = 'OMP_NUM_THREADS=32 somoclu -x 15 -y 15 data/train results/sample'
	    print(command)
	    os.system(command)
	else:
	    print('somoclu has already run')
	# ------------------------------------ getting subsets of the files
	# Read the (index, x, y) assignment of every training row and, on the
	# first run, write each row of data/train to submaps/data/<x>_<y>.
	print('Reading training classifications')
	with open('results/sample.bm', 'r') as classification:
	    # The first two lines are skipped as header; blank lines are dropped.
	    classes = [i.split() for i in classification.readlines()[2:] if i.strip()]
	if not os.path.exists('submaps'):
	    # Only submaps/data is created here; creating submaps/maps as well
	    # makes a directory-existence check in the map-generation step report
	    # the maps as already built on the first run.
	    os.makedirs('submaps/data')
	    with open('data/train', 'r') as trainfile:
	        train = trainfile.readlines()
	    # Group rows per cell so each output file is opened once.
	    rows_by_cell = defaultdict(list)
	    for index, x, y in tqdm(classes, ncols=60):
	        rows_by_cell['{}_{}'.format(x, y)].append(train[int(index)])
	    for cell, rows in rows_by_cell.items():
	        with open('submaps/data/{}'.format(cell), 'w') as fl:
	            fl.writelines(rows)
	# ----------------------------------------running maps for the new subgroups
	def generate_maps(command):
	    """Run one somoclu shell command; used as the pool worker function."""
	    os.system(command)


	# `folders` holds every occupied cell name "<x>_<y>", `commands` the
	# somoclu call that trains a 5x5 map on that cell's rows; `pending` is the
	# subset whose output file does not exist yet.
	commands, folders = set(), set()
	pending = []
	print('Generating commands for somoclu')
	for index, x, y in tqdm(classes, ncols=60):
	    name = '{}_{}'.format(x.strip(), y.strip())
	    if name in folders:
	        continue
	    folders.add(name)
	    # OMP_NUM_THREADS is the variable OpenMP reads (was "OMP_NUMBER_THREADS").
	    command = 'OMP_NUM_THREADS=32 somoclu -x 5 -y 5 submaps/data/{} submaps/maps/{}/map > /dev/null'.format(name, name)
	    commands.add(command)
	    if not os.path.exists('submaps/maps/{}/map.bm'.format(name)):
	        pending.append((name, command))


	# Decide from the map.bm outputs, not from the existence of submaps/maps,
	# which may exist without any trained map in it.
	if pending:
	    print('Creating needed folders')
	    for name, _ in tqdm(pending, ncols=60):
	        os.makedirs('submaps/maps/{}'.format(name), exist_ok=True)
	    print('Giving to pool')
	    pool = mp.Pool(mp.cpu_count())
	    pool.map(generate_maps, [command for _, command in pending])
	    pool.close()
	    pool.join()
	    print('Done generating maps')
	else:
	    print('Maps are already created')
	# ------------------------------------------get average classification per cell
	# For every sub-map, count the rows assigned to it and divide by its
	# number of cells; then average that density over all sub-maps.
	BM_HEADER_LINES = 2   # the .bm reader above also skips two leading lines
	SUBMAP_CELLS = 5 * 5  # sub-maps are trained with "-x 5 -y 5"; the old divisor was 35
	densities = []
	for name in tqdm(folders, ncols=60):
	    with open('submaps/maps/{}/map.bm'.format(name), 'r') as fl:
	        assigned = sum(1 for row in fl if row.strip()) - BM_HEADER_LINES
	    densities.append(assigned / float(SUBMAP_CELLS))
	if densities:
	    print('Average image per cell:', sum(densities) / len(densities))
	else:
	    # No occupied cells: avoid dividing by zero.
	    print('Average image per cell:', 0.0)