
TCAS64 Scripts and Snippets

Remarks: These snippets are written in Python, using popular data-science libraries (such as pandas). You can try them on Google Colab.

Related External Resources By Me

Universities Dataset

This dataset contains information about university entries. Related dataset URL: https://tcas.sgp1.digitaloceanspaces.com/data/universities.json

Initialization

import http.client as http_c
import json
import pandas as pd
conn=http_c.HTTPSConnection('tcas.sgp1.digitaloceanspaces.com')
conn.request('GET','/data/universities.json')
universities_json=conn.getresponse().read()
universities_extracted=json.loads(universities_json)
data=universities_extracted

Checking

General

# Check the length of each entry, then inspect the content of the maximum-length entry, for dropping unwanted keys
# In the year 2564 case, only 1 entry has the maximum number of keys
data_length=len(data)
lengths=[len(data[i]) for i in range(data_length)]
max_length=max(lengths)
for i,j in enumerate(lengths):
	if j==max_length: print('Max Length at Index #'+str(i))
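
To actually inspect the content of that maximum-length entry (index 30 in the 2564 data, as asserted in the next subsection), a minimal sketch in the same spirit as "Pretty-Print of Dictionary" under "Shared Resources":

for key,value in data[30].items():
	print(str(key)+': '+str(value))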

If the data has changed since the last access (these assertions will fail)

# Universities count
assert data_length==83
# Number of keys in the maximum-length entry
assert max_length==340
# Find the entries that have the maximum number of keys
highest_entries_index=[]
for i,j in enumerate(lengths):
	if j==max_length: highest_entries_index+=[i]

assert highest_entries_index==[30]
#
# Check for all available keys on all entries
assert list( pd.DataFrame(data).columns )==['id', 'updated_at', 'created_at', 'university_id', 'university_name', 'university_name_en', 'university_type', 'university_score', 'r1_apply', 'r2_apply', 'r31_apply', 'r32_apply', 'r4_apply', 'r1_accept', 'r2_accept', 'r31_accept', 'r32_accept', 'r4_accept', 'r1_confirm', 'r2_confirm', 'r31_confirm', 'r32_confirm', 'r4_confirm', 'r1_interview_not_pass', 'r2_interview_not_pass', 'r31_interview_not_pass', 'r32_interview_not_pass', 'r4_interview_not_pass', 'r1_selectanother', 'r2_selectanother', 'r31_selectanother', 'r32_selectanother', 'r4_selectanother', 'r1_ignore', 'r2_ignore', 'r31_ignore', 'r32_ignore', 'r4_ignore', 'r1_cancel_p1', 'r1_cancel_p2', 'r1_cancel_p3', 'r1_cancel_p4', 'r2_cancel_p2', 'r2_cancel_p3', 'r2_cancel_p4', 'r31_cancel_p3', 'r31_cancel_p4', 'r32_cancel', 'r4_cancel', 'file_path_1', 'file_path_2', 'file_path_3', 'file_path_4', 'updated_by', 'number_of_course_program', 'number_of_course_major', 'number_of_course_receive', 'number_of_round1_program', 'number_of_round1_major', 'number_of_round1_project', 'number_of_round1_receive', 'number_of_round1_not_project', 'number_of_round1_has_project', 'number_of_round2_program', 'number_of_round2_major', 'number_of_round2_project', 'number_of_round2_receive', 'number_of_round2_not_project', 'number_of_round2_has_project', 'number_of_round31_program', 'number_of_round31_major', 'number_of_round31_project', 'number_of_round31_receive', 'number_of_round31_not_project', 'number_of_round31_has_project', 'number_of_round32_program', 'number_of_round32_major', 'number_of_round32_project', 'number_of_round32_receive', 'number_of_round32_not_project', 'number_of_round32_has_project', 'number_of_round4_program', 'number_of_round4_major', 'number_of_round4_project', 'number_of_round4_receive', 'number_of_round4_not_project', 'number_of_round4_has_project', 'number_of_continuing_program', 'number_of_continuing_major', 'number_of_continuing_receive', 'number_of_bachelor_program', 'number_of_bachelor_major', 'number_of_bachelor_receive', 'number_of_g10_receive_mko2', 'number_of_g11_receive_mko2', 'number_of_g12_receive_mko2', 'number_of_g13_receive_mko2', 'number_of_g14_receive_mko2', 'number_of_g21_receive_mko2', 'number_of_g22_receive_mko2', 'number_of_g30_receive_mko2', 'number_of_g40_receive_mko2', 'number_of_g50_receive_mko2', 'number_of_g61_receive_mko2', 'number_of_g62_receive_mko2', 'number_of_g70_receive_mko2', 'number_of_g80_receive_mko2', 'number_of_g90_receive_mko2', 'number_of_g10r1_program', 'number_of_g11r1_program', 'number_of_g12r1_program', 'number_of_g13r1_program', 'number_of_g14r1_program', 'number_of_g21r1_program', 'number_of_g22r1_program', 'number_of_g30r1_program', 'number_of_g40r1_program', 'number_of_g50r1_program', 'number_of_g61r1_program', 'number_of_g62r1_program', 'number_of_g70r1_program', 'number_of_g80r1_program', 'number_of_g90r1_program', 'number_of_g10r2_program', 'number_of_g11r2_program', 'number_of_g12r2_program', 'number_of_g13r2_program', 'number_of_g14r2_program', 'number_of_g21r2_program', 'number_of_g22r2_program', 'number_of_g30r2_program', 'number_of_g40r2_program', 'number_of_g50r2_program', 'number_of_g61r2_program', 'number_of_g62r2_program', 'number_of_g70r2_program', 'number_of_g80r2_program', 'number_of_g90r2_program', 'number_of_g10r31_program', 'number_of_g11r31_program', 'number_of_g12r31_program', 'number_of_g13r31_program', 'number_of_g14r31_program', 'number_of_g21r31_program', 'number_of_g22r31_program', 'number_of_g30r31_program', 
'number_of_g40r31_program', 'number_of_g50r31_program', 'number_of_g61r31_program', 'number_of_g62r31_program', 'number_of_g70r31_program', 'number_of_g80r31_program', 'number_of_g90r31_program', 'number_of_g10r32_program', 'number_of_g11r32_program', 'number_of_g12r32_program', 'number_of_g13r32_program', 'number_of_g14r32_program', 'number_of_g21r32_program', 'number_of_g22r32_program', 'number_of_g30r32_program', 'number_of_g40r32_program', 'number_of_g50r32_program', 'number_of_g61r32_program', 'number_of_g62r32_program', 'number_of_g70r32_program', 'number_of_g80r32_program', 'number_of_g90r32_program', 'number_of_g10r4_program', 'number_of_g11r4_program', 'number_of_g12r4_program', 'number_of_g13r4_program', 'number_of_g14r4_program', 'number_of_g21r4_program', 'number_of_g22r4_program', 'number_of_g30r4_program', 'number_of_g40r4_program', 'number_of_g50r4_program', 'number_of_g61r4_program', 'number_of_g62r4_program', 'number_of_g70r4_program', 'number_of_g80r4_program', 'number_of_g90r4_program', 'number_of_g10r1_major', 'number_of_g11r1_major', 'number_of_g12r1_major', 'number_of_g13r1_major', 'number_of_g14r1_major', 'number_of_g21r1_major', 'number_of_g22r1_major', 'number_of_g30r1_major', 'number_of_g40r1_major', 'number_of_g50r1_major', 'number_of_g61r1_major', 'number_of_g62r1_major', 'number_of_g70r1_major', 'number_of_g80r1_major', 'number_of_g90r1_major', 'number_of_g10r2_major', 'number_of_g11r2_major', 'number_of_g12r2_major', 'number_of_g13r2_major', 'number_of_g14r2_major', 'number_of_g21r2_major', 'number_of_g22r2_major', 'number_of_g30r2_major', 'number_of_g40r2_major', 'number_of_g50r2_major', 'number_of_g61r2_major', 'number_of_g62r2_major', 'number_of_g70r2_major', 'number_of_g80r2_major', 'number_of_g90r2_major', 'number_of_g10r31_major', 'number_of_g11r31_major', 'number_of_g12r31_major', 'number_of_g13r31_major', 'number_of_g14r31_major', 'number_of_g21r31_major', 'number_of_g22r31_major', 'number_of_g30r31_major', 'number_of_g40r31_major', 'number_of_g50r31_major', 'number_of_g61r31_major', 'number_of_g62r31_major', 'number_of_g70r31_major', 'number_of_g80r31_major', 'number_of_g90r31_major', 'number_of_g10r32_major', 'number_of_g11r32_major', 'number_of_g12r32_major', 'number_of_g13r32_major', 'number_of_g14r32_major', 'number_of_g21r32_major', 'number_of_g22r32_major', 'number_of_g30r32_major', 'number_of_g40r32_major', 'number_of_g50r32_major', 'number_of_g61r32_major', 'number_of_g62r32_major', 'number_of_g70r32_major', 'number_of_g80r32_major', 'number_of_g90r32_major', 'number_of_g10r4_major', 'number_of_g11r4_major', 'number_of_g12r4_major', 'number_of_g13r4_major', 'number_of_g14r4_major', 'number_of_g21r4_major', 'number_of_g22r4_major', 'number_of_g30r4_major', 'number_of_g40r4_major', 'number_of_g50r4_major', 'number_of_g61r4_major', 'number_of_g62r4_major', 'number_of_g70r4_major', 'number_of_g80r4_major', 'number_of_g90r4_major', 'number_of_g10r1_receive', 'number_of_g11r1_receive', 'number_of_g12r1_receive', 'number_of_g13r1_receive', 'number_of_g14r1_receive', 'number_of_g21r1_receive', 'number_of_g22r1_receive', 'number_of_g30r1_receive', 'number_of_g40r1_receive', 'number_of_g50r1_receive', 'number_of_g61r1_receive', 'number_of_g62r1_receive', 'number_of_g70r1_receive', 'number_of_g80r1_receive', 'number_of_g90r1_receive', 'number_of_g10r2_receive', 'number_of_g11r2_receive', 'number_of_g12r2_receive', 'number_of_g13r2_receive', 'number_of_g14r2_receive', 'number_of_g21r2_receive', 'number_of_g22r2_receive', 'number_of_g30r2_receive', 
'number_of_g40r2_receive', 'number_of_g50r2_receive', 'number_of_g61r2_receive', 'number_of_g62r2_receive', 'number_of_g70r2_receive', 'number_of_g80r2_receive', 'number_of_g90r2_receive', 'number_of_g10r31_receive', 'number_of_g11r31_receive', 'number_of_g12r31_receive', 'number_of_g13r31_receive', 'number_of_g14r31_receive', 'number_of_g21r31_receive', 'number_of_g22r31_receive', 'number_of_g30r31_receive', 'number_of_g40r31_receive', 'number_of_g50r31_receive', 'number_of_g61r31_receive', 'number_of_g62r31_receive', 'number_of_g70r31_receive', 'number_of_g80r31_receive', 'number_of_g90r31_receive', 'number_of_g10r32_receive', 'number_of_g11r32_receive', 'number_of_g12r32_receive', 'number_of_g13r32_receive', 'number_of_g14r32_receive', 'number_of_g21r32_receive', 'number_of_g22r32_receive', 'number_of_g30r32_receive', 'number_of_g40r32_receive', 'number_of_g50r32_receive', 'number_of_g61r32_receive', 'number_of_g62r32_receive', 'number_of_g70r32_receive', 'number_of_g80r32_receive', 'number_of_g90r32_receive', 'number_of_g10r4_receive', 'number_of_g11r4_receive', 'number_of_g12r4_receive', 'number_of_g13r4_receive', 'number_of_g14r4_receive', 'number_of_g21r4_receive', 'number_of_g22r4_receive', 'number_of_g30r4_receive', 'number_of_g40r4_receive', 'number_of_g50r4_receive', 'number_of_g61r4_receive', 'number_of_g62r4_receive', 'number_of_g70r4_receive', 'number_of_g80r4_receive', 'number_of_g90r4_receive', 'number_all_round_program', 'number_all_round_major', 'number_all_round_project', 'number_all_round_has_project', 'number_all_round_not_project', 'file_path_handicap', 'file_path_6']

Pre-Processing Data

universities=pd.DataFrame({
	'id':[data[i].get('id') for i in range(len(data))],
	'code':[data[i].get('university_id') for i in range(len(data))],
	'name':[data[i].get('university_name') for i in range(len(data))],
	'en_name':[data[i].get('university_name_en') for i in range(len(data))],
	'type':pd.Categorical([data[i].get('university_type') for i in range(len(data))]),
	'score':[data[i].get('university_score') for i in range(len(data))],
})
# Add any key whose name contains the keyword "file"
for i in range(len(data)):
	adding_keys=[]
	for key in data[i].keys():
		if 'file' in key: adding_keys+=[key]
	for key in adding_keys:
		universities.loc[i,key]=data[i][key]
#

universities['created_at']=[data[i].get('created_at') for i in range(len(data))]
universities['updated_at']=[data[i].get('updated_at') for i in range(len(data))]
#

Processing Data

Create the variable "universities_isDesired" first (see "Misc" > "Local Variables Set By User").

# Adding user-customized column
universities['isDesired']=universities_isDesired

Saving data

# Save raw data
with open('/tmp/2564_UniversityList_Raw.json','wb') as f: f.write(universities_json)
# Save a new copy of the pandas DataFrame and export it
universities_readydata=universities.copy()
universities_readydata.to_csv('/tmp/2564_UniversityList_ReadyData.csv',index=False)
#
# Exporting final data
universities.to_csv('/tmp/2564_UniversityList.csv',index=False)

Readback

Ready Data And Final Data

file='2564_UniversityList_ReadyData.csv'
# OR
file='2564_UniversityList.csv'
universities=pd.read_csv(file,dtype={'type':'category','code':str})

Misc

# Get the names of universities whose code exists in available_courses_university_code, sorted by "score"
universities[universities.code.apply(lambda x: x in available_courses_university_code)].sort_values('score',ascending=False)[['name','score']]

Courses Dataset

This dataset contains information about course entries. Related dataset URL: https://tcas.sgp1.digitaloceanspaces.com/data/courses.json

Initialization

import http.client as http_c
import json
import pandas as pd
conn=http_c.HTTPSConnection('tcas.sgp1.digitaloceanspaces.com')
conn.request('GET','/data/courses.json')
courses_json=conn.getresponse().read()
courses_extracted=json.loads(courses_json)
data=courses_extracted

Checking

General

# Check the length of each entry, then inspect the content of the maximum-length entries, for dropping unwanted keys
# In the year 2564 case, 2 entries have the maximum number of keys
data_length=len(data)
lengths=[len(data[i]) for i in range(data_length)]
max_length=max(lengths)
for i,j in enumerate(lengths):
	if j==max_length: print('Max Length at Index #'+str(i))
#

# Check whether both entries have exactly the same keys; here this is True, so either one can be used for inspecting content
assert data[2018].keys()==data[2019].keys()

If the data has changed since the last access (these assertions will fail); also create the pandas DataFrame

# Courses count
assert data_length==5071
# Number of keys in the maximum-length entries
assert max_length==32
# Find the entries that have the maximum number of keys
highest_entries_index=[]
for i,j in enumerate(lengths):
	if j==max_length: highest_entries_index+=[i]

assert highest_entries_index==[2018,2019]
#
# Convert the data to a pandas DataFrame, then check all available keys across all entries
courses=pd.DataFrame(data)
assert list(courses.columns)==['id', 'updated_at', 'created_at', 'created_by_id', 'university_type_id', 'university_type_name_th', 'university_id', 'university_name_th', 'university_name_en', 'campus_id', 'campus_name_th', 'campus_name_en', 'faculty_id', 'faculty_name_th', 'faculty_name_en', 'group_field_id', 'group_field_th', 'field_id', 'field_name_th', 'field_name_en', 'program_running_number', 'program_name_th', 'program_name_en', 'program_type_id', 'program_type_name_th', 'program_id', 'number_acceptance_mko_2', 'major_id', 'major_name_th', 'institute_partners_th', 'country_partners_th', 'major_acceptance_number', 'major_name_en', 'program_partners_id', 'program_partners_inter_name']

Pre-Processing Data

# Arranging columns
courses=courses[['id','created_by_id','created_at','updated_at','university_id','university_name_th','university_name_en','university_type_id','university_type_name_th','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_id','program_name_th','program_name_en','program_type_id','program_type_name_th','program_running_number','major_id','major_name_th','major_name_en','major_acceptance_number','country_partners_th','institute_partners_th','program_partners_id','program_partners_inter_name','number_acceptance_mko_2',]]
# Convert some columns to the category datatype
conv_columns=['university_type_id','university_type_name_th','university_id','university_name_th','university_name_en','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_type_id','program_type_name_th','institute_partners_th','country_partners_th']
for col in conv_columns: courses[col]=courses[col].astype('category')

Imitated Search Engine

On https://www.mytcas.com/search/:

  • The search engine searches on these six names:
    • University
    • Campus
    • Faculty
    • Field
    • Major
    • Program
  • Also, whitespace acts as a separator for multiple search conditions, and all conditions must match, so I use an AND operator across keywords.
  • However, the website client limits a query to 100 entries, so for my convenience I have imitated that engine in the Python below:

def search_on_courses_dataset(keyword,dataset=None):
	if dataset is None: dataset=courses
	keywords=keyword.split()
	final_query=(dataset.index==dataset.index)
	for keyword in keywords:
		universities_search=dataset['university_name_th'].str.find(keyword)>=0
		campuses_search=dataset['campus_name_th'].str.find(keyword)>=0
		faculties_search=dataset['faculty_name_th'].str.find(keyword)>=0
		fields_search=dataset['field_name_th'].str.find(keyword)>=0
		majors_search=dataset['major_name_th'].str.find(keyword)>=0
		programs_search=dataset['program_name_th'].str.find(keyword)>=0
		search=(universities_search|faculties_search|campuses_search|fields_search|majors_search|programs_search)
		final_query&=search
	return final_query
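
A quick usage sketch (assuming courses has been created as above); the function returns a boolean mask aligned with the DataFrame index:

matched=courses[search_on_courses_dataset('วิทยา คอมพิวเตอร์')]
print(len(matched))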

This function returns a boolean mask of the query-matched entries. However, I can also disable searching on the university and campus names (and the faculty name), as in the code below:

def search_on_courses_dataset_v2(keyword,dataset=None):
	if dataset is None: dataset=courses
	keywords=keyword.split()
	final_query=(dataset.index==dataset.index)
	for keyword in keywords:
		fields_search=dataset['field_name_th'].str.find(keyword)>=0
		majors_search=dataset['major_name_th'].str.find(keyword)>=0
		programs_search=dataset['program_name_th'].str.find(keyword)>=0
		search=(fields_search|majors_search|programs_search)
		final_query&=search
	return final_query

There is also another, alternative search method. It prevents the input keywords from matching across different keys' values: all keywords must match within a single one of the used keys, rather than being spread over several keys' values. Code below:

def alt_search_on_courses_dataset(keyword,dataset=None):
	if dataset is None: dataset=courses
	keywords=keyword.split()
	fields_search=(dataset.index==dataset.index)
	majors_search=(dataset.index==dataset.index)
	programs_search=(dataset.index==dataset.index)
	for keyword in keywords: fields_search&=dataset['field_name_th'].str.find(keyword)>=0
	for keyword in keywords: majors_search&=dataset['major_name_th'].str.find(keyword)>=0
	for keyword in keywords: programs_search&=dataset['program_name_th'].str.find(keyword)>=0
	final_query=(fields_search|majors_search|programs_search)
	return final_query

FYI, the given methods are ordered by result count, descending; each later method's results are a subset of the earlier method's.
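
As a quick sanity check of that subset relation (a minimal sketch; it assumes courses is loaded and all three functions above are defined):

broad=search_on_courses_dataset('วิทยา คอมพิวเตอร์')
mid=search_on_courses_dataset_v2('วิทยา คอมพิวเตอร์')
narrow=alt_search_on_courses_dataset('วิทยา คอมพิวเตอร์')
# Each narrower mask must not match anything outside the broader one
assert not (mid&~broad).any()
assert not (narrow&~mid).any()
print(broad.sum(),mid.sum(),narrow.sum())

Finally, set the default method: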

search_on_courses_dataset=alt_search_on_courses_dataset

Classify and Get Desired Courses

Create the variable "accept_university_code" first (see "Misc" > "Local Variables Set By User").

# Filter out unwanted universities
mask=courses['university_id'].apply(lambda x: x in accept_university_code)
courses=courses[mask]
# Multiple Searches
ai_mask=search_on_courses_dataset('ปัญญาประดิษฐ์')
data_mask=search_on_courses_dataset('ข้อมูล')
com_mask=search_on_courses_dataset('คอมพิวเตอร์')
soft_mask=search_on_courses_dataset('ซอฟต์แวร์')
bioinfo_mask=search_on_courses_dataset('ชีว สาร')
stat_mask=search_on_courses_dataset('สถิติ')
info_mask=search_on_courses_dataset('สารสนเทศ')
industmath_mask=search_on_courses_dataset('คณิต อุตสาหก')
math_mask=search_on_courses_dataset('คณิต')
round1_mask=ai_mask|data_mask|com_mask|soft_mask|bioinfo_mask|stat_mask|info_mask|industmath_mask|math_mask
#
# Safely cut the other entries out
courses=courses[round1_mask]
# Re-align the masks with the new index
ai_mask=ai_mask[round1_mask]
data_mask=data_mask[round1_mask]
com_mask=com_mask[round1_mask]
soft_mask=soft_mask[round1_mask]
bioinfo_mask=bioinfo_mask[round1_mask]
stat_mask=stat_mask[round1_mask]
info_mask=info_mask[round1_mask]
industmath_mask=industmath_mask[round1_mask]
math_mask=math_mask[round1_mask]
#
# Let's do some more filtering
# Subset of com_mask
comsci_mask=search_on_courses_dataset('วิทยา คอมพิวเตอร์')
# Create new columns of data
courses['user_tag_1']=None
courses['user_tag_2']=None
courses['user_tag_1']=pd.Categorical(courses['user_tag_1'])
courses['user_tag_2']=pd.Categorical(courses['user_tag_2'])
#
# Some masks very likely indicate high-demand courses, so accumulate them into a single mask, then tag it
high_demand_mask=comsci_mask|data_mask|ai_mask|soft_mask|bioinfo_mask|stat_mask|industmath_mask
# Some masks may indicate very interesting (or even unusual) courses, so accumulate them into a single mask, then tag it
# FYI: very_interesting_mask, as defined, must be a subset of high_demand_mask
very_interesting_mask=bioinfo_mask|industmath_mask|data_mask|ai_mask
# Adjust "user_tag_1"
courses['user_tag_1'].cat.add_categories('HighDemand',inplace=True)
courses['user_tag_1'].cat.add_categories('HighDemandAndInteresting',inplace=True)
courses.loc[high_demand_mask,'user_tag_1']='HighDemand'
courses.loc[very_interesting_mask,'user_tag_1']='HighDemandAndInteresting'
#
# The remaining entries will be checked manually. Create another mask for this task.
no_status_mask=~high_demand_mask
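# Optionally, list the remaining untagged entries for manual review (a minimal sketch)
print(courses.loc[no_status_mask,['university_name_th','faculty_name_th','program_name_th']])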
# Tag according to the masks, ordered by priority (later assignments override earlier ones)
courses['user_tag_2'].cat.add_categories('info',inplace=True)
courses['user_tag_2'].cat.add_categories('stat',inplace=True)
courses['user_tag_2'].cat.add_categories('bioinfo',inplace=True)
courses['user_tag_2'].cat.add_categories('math',inplace=True)
courses['user_tag_2'].cat.add_categories('soft',inplace=True)
courses['user_tag_2'].cat.add_categories('com',inplace=True)
courses['user_tag_2'].cat.add_categories('industmath',inplace=True)
courses['user_tag_2'].cat.add_categories('data',inplace=True)
courses['user_tag_2'].cat.add_categories('comsci',inplace=True)
courses['user_tag_2'].cat.add_categories('ai',inplace=True)
courses.loc[info_mask,'user_tag_2']='info'
courses.loc[stat_mask,'user_tag_2']='stat'
courses.loc[bioinfo_mask,'user_tag_2']='bioinfo'
courses.loc[math_mask,'user_tag_2']='math'
courses.loc[soft_mask,'user_tag_2']='soft'
courses.loc[com_mask,'user_tag_2']='com'
courses.loc[industmath_mask,'user_tag_2']='industmath'
courses.loc[data_mask,'user_tag_2']='data'
courses.loc[comsci_mask,'user_tag_2']='comsci'
courses.loc[ai_mask,'user_tag_2']='ai'
#
# For pretty-printing (see "Shared Resources (Used by my other works)" > "Pretty-Print of Pandas DataFrame")
print_df=courses[high_demand_mask][['university_name_th','university_type_name_th','campus_name_th','faculty_name_th','group_field_th','field_name_th','major_name_th','program_name_th','program_type_name_th','user_tag_1','user_tag_2']]

Saving data

# Save raw data (the online data may change later, so keep a local copy)
with open('/tmp/2564_MyCoursesList_Raw.json','wb') as f: f.write(courses_json)
# Save a new copy of the pandas DataFrame and export it
courses_readydata=courses.copy()
courses_readydata.to_csv('/tmp/2564_MyCoursesList_ReadyData.csv',index=False)
#
# Exporting
courses[high_demand_mask].to_csv('/tmp/2564_MyCoursesList.csv',index=False)

Readback

Ready Data

dtypes_to_change={'program_running_number':'str'}
conv_columns=['university_type_id','university_type_name_th','university_id','university_name_th','university_name_en','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_type_id','program_type_name_th','institute_partners_th','country_partners_th']
for col in conv_columns: dtypes_to_change[col]='category'

courses=pd.read_csv('2564_MyCoursesList_ReadyData.csv',dtype=dtypes_to_change)

Final Data

dtypes_to_change={'program_running_number':'str'}
conv_columns=['university_type_id','university_type_name_th','university_id','university_name_th','university_name_en','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_type_id','program_type_name_th','institute_partners_th','country_partners_th']
conv_columns+=['user_tag_1','user_tag_2',]
for col in conv_columns: dtypes_to_change[col]='category'

courses=pd.read_csv('2564_MyCoursesList.csv',dtype=dtypes_to_change)

Legacy Data Processing (Not Used, Outdated, Misleading in Places)

Code

# Filenames to read (See Inputted Data A)
names=['ai-degrees','data-degrees','com-degrees','soft-degrees','stat-degrees','info-degrees']
from types import SimpleNamespace
import pandas as pd
files={}
for name in names:
	file=SimpleNamespace(name=name)
	file.lines=open(file.name).read().splitlines()
	## Automatically checking
	print('[Automatically checking]')
	assert len(file.lines)%5==0
	print('Passed')
	print()
	## Manually checking
	print('[Manually checking]',end='')
	print('\n+ Below must be University Name')
	print(file.lines[0::5][-10:]) # Must be University Name
	print('\n+ Below must be University Code')
	print(file.lines[4::5][-10:]) # Must be University Code
	print('\n+ Below must be University Name')
	print(file.lines[5::5][-10:]) # Must be University Name
	input('\nOK?')
	print()
	## Saving data
	files[name]=pd.DataFrame({
		'university':pd.Categorical(file.lines[0::5]),
		'faculty':pd.Categorical(file.lines[1::5]),
		'branch':file.lines[2::5],
		'course':file.lines[3::5],
		'university_code':pd.Categorical(file.lines[4::5]),
	})
# Merge all entries from all files, then drop duplicates
df=pd.concat([files[name] for name in names]).drop_duplicates().reset_index(drop=True)
# Set to category datatype
df['university']=pd.Categorical(df['university'])
df['faculty']=pd.Categorical(df['faculty'])
df['university_code']=pd.Categorical(df['university_code'])
#
# Add new column
df['user_tag']=None
df['user_tag']=pd.Categorical(df['user_tag'])
#
# Up to user
df['user_tag'].cat.add_categories(Unknown[0],inplace=True)
df['user_tag'].cat.add_categories(General[0],inplace=True)
df['user_tag'].cat.add_categories(LowDesired[0],inplace=True)
df['user_tag'].cat.add_categories(NoNeed[0],inplace=True)
df['user_tag'].cat.add_categories(Health[0],inplace=True)
df['user_tag'].cat.add_categories(Geo[0],inplace=True)
df['user_tag'].cat.add_categories(Teach[0],inplace=True)
df['user_tag'].cat.add_categories(Lang[0],inplace=True)
#
# Create a mask for entries whose university code is in accept_university_code
mask=df.university_code.apply(lambda x: x in accept_university_code)
# Substitute the current DataFrame with the filtered result
df=df[mask].reset_index(drop=True)
# Set the user_tag value for each entry
df.loc[NoNeed[1],'user_tag']=NoNeed[0]
df.loc[Health[1],'user_tag']=Health[0]
df.loc[Geo[1],'user_tag']=Geo[0]
df.loc[Teach[1],'user_tag']=Teach[0]
df.loc[Lang[1],'user_tag']=Lang[0]
#
# Exporting
df.to_csv('/tmp/result.csv',index=False)

Inputted Data A

  • Contains the dataset of course entries
  • Text copied (via simple mouse dragging) from the queried search-result table of https://www.mytcas.com/search
  • Tested against the website system in early 2021
  • Each entry must have 5 lines, consisting of:
    • university, faculty, branch, program, university_code (see the parsing sketch after the example below)

Inputted Data Example (2 sample entries)

มหาวิทยาลัยธุรกิจบัณฑิตย์
วิทยาลัยการแพทย์บูรณาการ
เครื่องสำอาง-ความงาม
วิทยาศาสตรบัณฑิต สาขาวิชาบูรณาการสุขภาพและความงาม (ภาษาไทย ปกติ)
103
มหาวิทยาลัยธุรกิจบัณฑิตย์
วิทยาลัยการแพทย์บูรณาการ
เทคโนโลยีการอาหาร
วิทยาศาสตรบัณฑิต สาขาวิชาการประกอบอาหารเพื่อสุขภาพ (ภาษาไทย ปกติ)
103
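
A minimal parsing sketch over the two example entries above (assuming they are saved in a file hypothetically named example-degrees):

lines=open('example-degrees').read().splitlines()
assert len(lines)%5==0 # Each entry must span exactly 5 lines
print(lines[0::5]) # University names
print(lines[2::5]) # Branches
print(lines[4::5]) # University codes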

Misc

Local Variables Set By User

# Acceptable-university markers, one per index of the universities dataset (user-selected from 2564_UniversityList.csv)
universities_isDesired=[True, True, True, True, True, False, False, True, False, False, True, False, False, False, True, True, False, False, False, True, False, True, False, False, False, False, False, True, True, False, False, True, True, True, True, False, False, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, False, True, True, True, False, True, False, True, True, True, True, False, False, True, True, False, False, False, True, False, False, True, False, True]
# Acceptable university code
accept_university_code=list(universities[universities_isDesired].code)
# University IDs that appear in my desired courses
university_with_excellent_courses=list(courses[very_interesting_mask].university_id.unique())
university_with_high_demand_courses=list(courses[high_demand_mask].university_id.unique())
available_courses_university_code=list(courses.university_id.unique())
#

Outdated

# Names and index values for classification entries (outdated; only used in "Courses Dataset" > "Legacy Data Processing")
Unknown=['ไม่กำหนดประเภท',[]]
General=['ทั่วไป',[]]
MedDesired=['มีความต้องการปานกลาง',[26, 29, 34, 41, 44, 62, 65, 66, 91, 92, 94, 95, 98, 99, 100, 102, 103, 104, 105, 106, 107, 109, 111, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 133]]
LowDesired=['มีความต้องการน้อย',[14, 20, 21, 23, 25, 28, 31, 32, 33, 38, 40, 42, 45, 46, 47, 48, 51, 52, 53, 54, 58, 61, 67, 77, 81, 82, 87, 96, 97, 101, 108, 110, 112, 115, 134]]
Health=['พิเศษที่สายสุขภาพ',[6,13]]
Geo=['พิเศษที่ภูมิศาสตร์',[93,124,130]]
Teach=['พิเศษที่คุรุศาสตร์',[24]]
Lang=['พิเศษที่ภาษา',[131,132]]
#

Shared Resources (Used by my other works)

Pretty-Print of Pandas DataFrame

print_df=courses
text=''
cols=print_df.columns
for i in print_df.index:
	msg=' Index ('+str(i)+') '
	char_num_1=len(msg)
	char_num_2=80-char_num_1
	text+='#'*(char_num_2-(char_num_2//2))
	text+=msg
	text+='#'*(char_num_2//2)+'\n'
	for col in cols:
		text+=col+': '+str(print_df[col].loc[i])+'\n'
	text+='\n\n'

with open('/tmp/pretty_printed','w') as f: f.write(text)

Pretty-Print of Dictionary

print_dict=data[30]
text=''
for key,value in print_dict.items():
	text+=str(key)
	text+=': '
	text+=str(value)
	text+='\n'

with open('/tmp/pretty_printed','w') as f: f.write(text)