Remarks: This code is written in Python, using popular data science libraries (such as pandas). You can try it on Google Colab.
Dataset contains information about university entries. Related dataset URL: https://tcas.sgp1.digitaloceanspaces.com/data/universities.json
# Fetch the universities dataset from DigitalOcean Spaces and decode the JSON
# payload into a list of per-university dicts.
import http.client as http_c
import json
import pandas as pd

conn = http_c.HTTPSConnection('tcas.sgp1.digitaloceanspaces.com')
conn.request('GET', '/data/universities.json')
universities_json = conn.getresponse().read()
conn.close()  # release the socket; the original never closed the connection
universities_extracted = json.loads(universities_json)
# Short alias used by the exploration code below.
data = universities_extracted
# Survey how many keys each entry carries; the widest entry exposes every
# available field, which helps decide which keys to drop.
# In the year 2564 capture, exactly one entry has the maximum key count.
data_length = len(data)
lengths = [len(entry) for entry in data]
max_length = max(lengths)
for idx, key_count in enumerate(lengths):
    if key_count == max_length:
        print('Max Length at Index #' + str(idx))
# Universities Amount
assert data_length == 83
# Length of highest-keys entry
assert max_length == 340
# Check for entries that have highest-keys
highest_entries_index = [idx for idx, key_count in enumerate(lengths)
                         if key_count == max_length]
assert highest_entries_index == [30]
#
# Check for all available keys on all entries
# Schema snapshot: building a DataFrame from the raw list of dicts and
# comparing its columns against this fixed list pins the 2564 dataset layout;
# the assert fails loudly if the online data ever changes shape.
assert list( pd.DataFrame(data).columns )==['id', 'updated_at', 'created_at', 'university_id', 'university_name', 'university_name_en', 'university_type', 'university_score', 'r1_apply', 'r2_apply', 'r31_apply', 'r32_apply', 'r4_apply', 'r1_accept', 'r2_accept', 'r31_accept', 'r32_accept', 'r4_accept', 'r1_confirm', 'r2_confirm', 'r31_confirm', 'r32_confirm', 'r4_confirm', 'r1_interview_not_pass', 'r2_interview_not_pass', 'r31_interview_not_pass', 'r32_interview_not_pass', 'r4_interview_not_pass', 'r1_selectanother', 'r2_selectanother', 'r31_selectanother', 'r32_selectanother', 'r4_selectanother', 'r1_ignore', 'r2_ignore', 'r31_ignore', 'r32_ignore', 'r4_ignore', 'r1_cancel_p1', 'r1_cancel_p2', 'r1_cancel_p3', 'r1_cancel_p4', 'r2_cancel_p2', 'r2_cancel_p3', 'r2_cancel_p4', 'r31_cancel_p3', 'r31_cancel_p4', 'r32_cancel', 'r4_cancel', 'file_path_1', 'file_path_2', 'file_path_3', 'file_path_4', 'updated_by', 'number_of_course_program', 'number_of_course_major', 'number_of_course_receive', 'number_of_round1_program', 'number_of_round1_major', 'number_of_round1_project', 'number_of_round1_receive', 'number_of_round1_not_project', 'number_of_round1_has_project', 'number_of_round2_program', 'number_of_round2_major', 'number_of_round2_project', 'number_of_round2_receive', 'number_of_round2_not_project', 'number_of_round2_has_project', 'number_of_round31_program', 'number_of_round31_major', 'number_of_round31_project', 'number_of_round31_receive', 'number_of_round31_not_project', 'number_of_round31_has_project', 'number_of_round32_program', 'number_of_round32_major', 'number_of_round32_project', 'number_of_round32_receive', 'number_of_round32_not_project', 'number_of_round32_has_project', 'number_of_round4_program', 'number_of_round4_major', 'number_of_round4_project', 'number_of_round4_receive', 'number_of_round4_not_project', 'number_of_round4_has_project', 'number_of_continuing_program', 'number_of_continuing_major', 'number_of_continuing_receive', 
'number_of_bachelor_program', 'number_of_bachelor_major', 'number_of_bachelor_receive', 'number_of_g10_receive_mko2', 'number_of_g11_receive_mko2', 'number_of_g12_receive_mko2', 'number_of_g13_receive_mko2', 'number_of_g14_receive_mko2', 'number_of_g21_receive_mko2', 'number_of_g22_receive_mko2', 'number_of_g30_receive_mko2', 'number_of_g40_receive_mko2', 'number_of_g50_receive_mko2', 'number_of_g61_receive_mko2', 'number_of_g62_receive_mko2', 'number_of_g70_receive_mko2', 'number_of_g80_receive_mko2', 'number_of_g90_receive_mko2', 'number_of_g10r1_program', 'number_of_g11r1_program', 'number_of_g12r1_program', 'number_of_g13r1_program', 'number_of_g14r1_program', 'number_of_g21r1_program', 'number_of_g22r1_program', 'number_of_g30r1_program', 'number_of_g40r1_program', 'number_of_g50r1_program', 'number_of_g61r1_program', 'number_of_g62r1_program', 'number_of_g70r1_program', 'number_of_g80r1_program', 'number_of_g90r1_program', 'number_of_g10r2_program', 'number_of_g11r2_program', 'number_of_g12r2_program', 'number_of_g13r2_program', 'number_of_g14r2_program', 'number_of_g21r2_program', 'number_of_g22r2_program', 'number_of_g30r2_program', 'number_of_g40r2_program', 'number_of_g50r2_program', 'number_of_g61r2_program', 'number_of_g62r2_program', 'number_of_g70r2_program', 'number_of_g80r2_program', 'number_of_g90r2_program', 'number_of_g10r31_program', 'number_of_g11r31_program', 'number_of_g12r31_program', 'number_of_g13r31_program', 'number_of_g14r31_program', 'number_of_g21r31_program', 'number_of_g22r31_program', 'number_of_g30r31_program', 'number_of_g40r31_program', 'number_of_g50r31_program', 'number_of_g61r31_program', 'number_of_g62r31_program', 'number_of_g70r31_program', 'number_of_g80r31_program', 'number_of_g90r31_program', 'number_of_g10r32_program', 'number_of_g11r32_program', 'number_of_g12r32_program', 'number_of_g13r32_program', 'number_of_g14r32_program', 'number_of_g21r32_program', 'number_of_g22r32_program', 'number_of_g30r32_program', 
'number_of_g40r32_program', 'number_of_g50r32_program', 'number_of_g61r32_program', 'number_of_g62r32_program', 'number_of_g70r32_program', 'number_of_g80r32_program', 'number_of_g90r32_program', 'number_of_g10r4_program', 'number_of_g11r4_program', 'number_of_g12r4_program', 'number_of_g13r4_program', 'number_of_g14r4_program', 'number_of_g21r4_program', 'number_of_g22r4_program', 'number_of_g30r4_program', 'number_of_g40r4_program', 'number_of_g50r4_program', 'number_of_g61r4_program', 'number_of_g62r4_program', 'number_of_g70r4_program', 'number_of_g80r4_program', 'number_of_g90r4_program', 'number_of_g10r1_major', 'number_of_g11r1_major', 'number_of_g12r1_major', 'number_of_g13r1_major', 'number_of_g14r1_major', 'number_of_g21r1_major', 'number_of_g22r1_major', 'number_of_g30r1_major', 'number_of_g40r1_major', 'number_of_g50r1_major', 'number_of_g61r1_major', 'number_of_g62r1_major', 'number_of_g70r1_major', 'number_of_g80r1_major', 'number_of_g90r1_major', 'number_of_g10r2_major', 'number_of_g11r2_major', 'number_of_g12r2_major', 'number_of_g13r2_major', 'number_of_g14r2_major', 'number_of_g21r2_major', 'number_of_g22r2_major', 'number_of_g30r2_major', 'number_of_g40r2_major', 'number_of_g50r2_major', 'number_of_g61r2_major', 'number_of_g62r2_major', 'number_of_g70r2_major', 'number_of_g80r2_major', 'number_of_g90r2_major', 'number_of_g10r31_major', 'number_of_g11r31_major', 'number_of_g12r31_major', 'number_of_g13r31_major', 'number_of_g14r31_major', 'number_of_g21r31_major', 'number_of_g22r31_major', 'number_of_g30r31_major', 'number_of_g40r31_major', 'number_of_g50r31_major', 'number_of_g61r31_major', 'number_of_g62r31_major', 'number_of_g70r31_major', 'number_of_g80r31_major', 'number_of_g90r31_major', 'number_of_g10r32_major', 'number_of_g11r32_major', 'number_of_g12r32_major', 'number_of_g13r32_major', 'number_of_g14r32_major', 'number_of_g21r32_major', 'number_of_g22r32_major', 'number_of_g30r32_major', 'number_of_g40r32_major', 
'number_of_g50r32_major', 'number_of_g61r32_major', 'number_of_g62r32_major', 'number_of_g70r32_major', 'number_of_g80r32_major', 'number_of_g90r32_major', 'number_of_g10r4_major', 'number_of_g11r4_major', 'number_of_g12r4_major', 'number_of_g13r4_major', 'number_of_g14r4_major', 'number_of_g21r4_major', 'number_of_g22r4_major', 'number_of_g30r4_major', 'number_of_g40r4_major', 'number_of_g50r4_major', 'number_of_g61r4_major', 'number_of_g62r4_major', 'number_of_g70r4_major', 'number_of_g80r4_major', 'number_of_g90r4_major', 'number_of_g10r1_receive', 'number_of_g11r1_receive', 'number_of_g12r1_receive', 'number_of_g13r1_receive', 'number_of_g14r1_receive', 'number_of_g21r1_receive', 'number_of_g22r1_receive', 'number_of_g30r1_receive', 'number_of_g40r1_receive', 'number_of_g50r1_receive', 'number_of_g61r1_receive', 'number_of_g62r1_receive', 'number_of_g70r1_receive', 'number_of_g80r1_receive', 'number_of_g90r1_receive', 'number_of_g10r2_receive', 'number_of_g11r2_receive', 'number_of_g12r2_receive', 'number_of_g13r2_receive', 'number_of_g14r2_receive', 'number_of_g21r2_receive', 'number_of_g22r2_receive', 'number_of_g30r2_receive', 'number_of_g40r2_receive', 'number_of_g50r2_receive', 'number_of_g61r2_receive', 'number_of_g62r2_receive', 'number_of_g70r2_receive', 'number_of_g80r2_receive', 'number_of_g90r2_receive', 'number_of_g10r31_receive', 'number_of_g11r31_receive', 'number_of_g12r31_receive', 'number_of_g13r31_receive', 'number_of_g14r31_receive', 'number_of_g21r31_receive', 'number_of_g22r31_receive', 'number_of_g30r31_receive', 'number_of_g40r31_receive', 'number_of_g50r31_receive', 'number_of_g61r31_receive', 'number_of_g62r31_receive', 'number_of_g70r31_receive', 'number_of_g80r31_receive', 'number_of_g90r31_receive', 'number_of_g10r32_receive', 'number_of_g11r32_receive', 'number_of_g12r32_receive', 'number_of_g13r32_receive', 'number_of_g14r32_receive', 'number_of_g21r32_receive', 'number_of_g22r32_receive', 'number_of_g30r32_receive', 
'number_of_g40r32_receive', 'number_of_g50r32_receive', 'number_of_g61r32_receive', 'number_of_g62r32_receive', 'number_of_g70r32_receive', 'number_of_g80r32_receive', 'number_of_g90r32_receive', 'number_of_g10r4_receive', 'number_of_g11r4_receive', 'number_of_g12r4_receive', 'number_of_g13r4_receive', 'number_of_g14r4_receive', 'number_of_g21r4_receive', 'number_of_g22r4_receive', 'number_of_g30r4_receive', 'number_of_g40r4_receive', 'number_of_g50r4_receive', 'number_of_g61r4_receive', 'number_of_g62r4_receive', 'number_of_g70r4_receive', 'number_of_g80r4_receive', 'number_of_g90r4_receive', 'number_all_round_program', 'number_all_round_major', 'number_all_round_project', 'number_all_round_has_project', 'number_all_round_not_project', 'file_path_handicap', 'file_path_6']
def _column(key):
    # Pull one key from every raw entry; missing keys become None via dict.get.
    return [entry.get(key) for entry in data]

# Build the universities frame from a whitelist of keys, renaming them to
# shorter column names on the way.
universities = pd.DataFrame({
    'id': _column('id'),
    'code': _column('university_id'),
    'name': _column('university_name'),
    'en_name': _column('university_name_en'),
    'type': pd.Categorical(_column('university_type')),
    'score': _column('university_score'),
})
# Copy every key whose name contains the substring "file" (e.g. file_path_*)
# into the universities frame, cell by cell; rows without a given key are
# left as NaN by pandas.
for row, entry in enumerate(data):
    for key, value in entry.items():
        if 'file' in key:
            universities.loc[row, key] = value
#
# Carry over the raw timestamp columns as-is.
universities['created_at'] = [entry.get('created_at') for entry in data]
universities['updated_at'] = [entry.get('updated_at') for entry in data]
#
Create the variable "universities_isDesired" first. (See "Misc" > "Local Variable Set By User")
# Adding user-customized column
# `universities_isDesired` is a user-supplied list of booleans, one per
# university row (see "Misc" > "Local Variable Set By User"); its length
# must match the number of rows.
universities['isDesired']=universities_isDesired
# Save raw data.
# Use a context manager so the file handle is closed deterministically;
# the original bare open().write() leaked the handle.
with open('/tmp/2564_UniversityList_Raw.json', 'wb') as fh:
    fh.write(universities_json)
# Saving as a new copied Pandas DataFrame and export.
universities_readydata = universities.copy()
universities_readydata.to_csv('/tmp/2564_UniversityList_ReadyData.csv', index=False)
#
# Exporting final data.
universities.to_csv('/tmp/2564_UniversityList.csv', index=False)
# Pick which exported CSV to reload; both alternatives are kept on purpose,
# and the second assignment wins by default.
file='2564_UniversityList_ReadyData.csv'
# OR
file='2564_UniversityList.csv'
# Restore dtypes lost in the CSV round-trip: 'type' as category, 'code' as str.
universities=pd.read_csv(file,dtype={'type':'category','code':str})
# Universities whose code exists in available_courses_university_code, sorted by "score" (descending)
universities[universities.code.apply(lambda x: x in available_courses_university_code)].sort_values('score',ascending=False)[['name','score']]
Dataset contains information about course entries. Related dataset URL: https://tcas.sgp1.digitaloceanspaces.com/data/courses.json
# Fetch the courses dataset from DigitalOcean Spaces and decode the JSON
# payload into a list of per-course dicts.
import http.client as http_c
import json
import pandas as pd

conn = http_c.HTTPSConnection('tcas.sgp1.digitaloceanspaces.com')
conn.request('GET', '/data/courses.json')
courses_json = conn.getresponse().read()
conn.close()  # release the socket; the original never closed the connection
courses_extracted = json.loads(courses_json)
# Short alias used by the exploration code below.
data = courses_extracted
# Survey how many keys each entry carries; the widest entries expose the full
# course schema. In the year 2564 capture, two entries tie for the maximum.
data_length = len(data)
lengths = [len(entry) for entry in data]
max_length = max(lengths)
for idx, key_count in enumerate(lengths):
    if key_count == max_length:
        print('Max Length at Index #' + str(idx))
#
# The two widest entries share an identical key set (True here), so either
# one can be used for inspecting the full content.
assert data[2018].keys() == data[2019].keys()
# Courses Amount
assert data_length == 5071
# Length of highest-keys entry
assert max_length == 32
# Check for entries that have highest-keys
highest_entries_index = [idx for idx, key_count in enumerate(lengths)
                         if key_count == max_length]
assert highest_entries_index == [2018, 2019]
#
# Convert data to Pandas DataFrame, then check for all available keys on all
# entries. The assert pins the 2564 schema and fails loudly if the online
# data ever changes shape.
courses=pd.DataFrame(data)
assert list(courses.columns)==['id', 'updated_at', 'created_at', 'created_by_id', 'university_type_id', 'university_type_name_th', 'university_id', 'university_name_th', 'university_name_en', 'campus_id', 'campus_name_th', 'campus_name_en', 'faculty_id', 'faculty_name_th', 'faculty_name_en', 'group_field_id', 'group_field_th', 'field_id', 'field_name_th', 'field_name_en', 'program_running_number', 'program_name_th', 'program_name_en', 'program_type_id', 'program_type_name_th', 'program_id', 'number_acceptance_mko_2', 'major_id', 'major_name_th', 'institute_partners_th', 'country_partners_th', 'major_acceptance_number', 'major_name_en', 'program_partners_id', 'program_partners_inter_name']
# Arrange columns into a stable, human-friendly order (ids/names grouped by
# university -> campus -> faculty -> field -> program -> major -> partners).
courses=courses[['id','created_by_id','created_at','updated_at','university_id','university_name_th','university_name_en','university_type_id','university_type_name_th','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_id','program_name_th','program_name_en','program_type_id','program_type_name_th','program_running_number','major_id','major_name_th','major_name_en','major_acceptance_number','country_partners_th','institute_partners_th','program_partners_id','program_partners_inter_name','number_acceptance_mko_2',]]
# Convert low-cardinality id/name columns to the memory-saving 'category' dtype.
conv_columns=['university_type_id','university_type_name_th','university_id','university_name_th','university_name_en','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_type_id','program_type_name_th','institute_partners_th','country_partners_th']
for col in conv_columns: courses[col]=courses[col].astype('category')
In the https://www.mytcas.com/search/ :
- The search engine searches on these six names:
- University
- Campus
- Faculty
- Field
- Major
- Program
- Also, a whitespace separator expresses multiple search conditions, all of which must be matched, so I use an AND operator for this task.
- However, the website client limits a query to 100 entries, so for my convenience I have imitated that engine for searching, via the Python code below:
def search_on_courses_dataset(keyword, dataset=None):
    """Imitate the mytcas.com search box over the courses frame.

    `keyword` is split on whitespace; every term must match (AND), where a
    term matches a row when it is a substring of any of the six searchable
    Thai name columns (OR). Returns a boolean mask aligned with `dataset`.
    Falls back to the module-level `courses` frame when `dataset` is None.
    """
    if dataset is None:
        dataset = courses
    searchable = ('university_name_th', 'campus_name_th', 'faculty_name_th',
                  'field_name_th', 'major_name_th', 'program_name_th')
    matched = (dataset.index == dataset.index)  # all-True start for the AND chain
    for term in keyword.split():
        # str.find >= 0 means "contains"; NaN cells compare False.
        per_column = [dataset[col].str.find(term) >= 0 for col in searchable]
        in_any = per_column[0]
        for hit in per_column[1:]:
            in_any = in_any | hit
        matched &= in_any
    return matched
This function returns a boolean mask of query-matched entries. However, I can also disable searching on the university and campus names (and the faculty name as well), as in the code below:
def search_on_courses_dataset_v2(keyword, dataset=None):
    """Narrower variant of the mytcas.com-style search.

    Same AND-over-terms / OR-over-columns logic as search_on_courses_dataset,
    but only the field, major, and program Thai names are searched (university,
    campus, and faculty names are ignored). Returns a boolean mask aligned with
    `dataset`; defaults to the module-level `courses` frame when None.
    """
    if dataset is None:
        dataset = courses
    searchable = ('field_name_th', 'major_name_th', 'program_name_th')
    matched = (dataset.index == dataset.index)  # all-True start for the AND chain
    for term in keyword.split():
        per_column = [dataset[col].str.find(term) >= 0 for col in searchable]
        in_any = per_column[0]
        for hit in per_column[1:]:
            in_any = in_any | hit
        matched &= in_any
    return matched
However, here is another alternative searching method. It prevents the input keywords from matching across different keys' values: all input keywords must match within a single one of the used keys, rather than being spread over multiple keys' values. The code is below:
def alt_search_on_courses_dataset(keyword, dataset=None):
    """Stricter search: every term must match within a single column.

    A row qualifies only when ALL whitespace-separated terms are substrings of
    the same column (field, major, or program Thai name). Returns a boolean
    mask aligned with `dataset`; defaults to the module-level `courses` frame
    when `dataset` is None.
    """
    if dataset is None:
        dataset = courses
    terms = keyword.split()
    column_hits = []
    for column in ('field_name_th', 'major_name_th', 'program_name_th'):
        hits = (dataset.index == dataset.index)  # all-True, then AND each term down
        for term in terms:
            hits &= dataset[column].str.find(term) >= 0
        column_hits.append(hits)
    return column_hits[0] | column_hits[1] | column_hits[2]
FYI, the given methods are sorted by result count in descending order, like subsets of a search result. Finally, set the default method:
# Make the strict (all-terms-within-one-column) matcher the default search method.
search_on_courses_dataset=alt_search_on_courses_dataset
Create the variable "accept_university_code" first. (See "Misc" > "Local Variable Set By User")
# Filtering out unwanted universities.
# NOTE: the mask variable was renamed from `bool`, which shadowed the builtin
# of the same name; membership is tested against a set for O(1) lookups.
_accepted_codes = set(accept_university_code)
desired = courses['university_id'].apply(lambda x: x in _accepted_codes)
courses = courses[desired]
# Multiple searches (Thai keywords: AI, data, computer, software,
# bio+informatics, statistics, information, industrial math, math).
ai_mask=search_on_courses_dataset('ปัญญาประดิษฐ์')
data_mask=search_on_courses_dataset('ข้อมูล')
com_mask=search_on_courses_dataset('คอมพิวเตอร์')
soft_mask=search_on_courses_dataset('ซอฟต์แวร์')
bioinfo_mask=search_on_courses_dataset('ชีว สาร')
stat_mask=search_on_courses_dataset('สถิติ')
info_mask=search_on_courses_dataset('สารสนเทศ')
industmath_mask=search_on_courses_dataset('คณิต อุตสาหก')
math_mask=search_on_courses_dataset('คณิต')
# Union of all keyword hits: the first-round keep set.
round1_mask=ai_mask|data_mask|com_mask|soft_mask|bioinfo_mask|stat_mask|info_mask|industmath_mask|math_mask
#
# Safely cut the non-matching entries out.
courses = courses[round1_mask]
# Re-index every keyword mask so each one lines up with the trimmed frame.
_all_masks = [ai_mask, data_mask, com_mask, soft_mask, bioinfo_mask,
              stat_mask, info_mask, industmath_mask, math_mask]
(ai_mask, data_mask, com_mask, soft_mask, bioinfo_mask,
 stat_mask, info_mask, industmath_mask, math_mask) = [m[round1_mask] for m in _all_masks]
#
# A little more filtering: computer-science proper, a subset of com_mask.
comsci_mask = search_on_courses_dataset('วิทยา คอมพิวเตอร์')
# Create two empty user-tag columns as categoricals (no categories yet).
for tag_column in ('user_tag_1', 'user_tag_2'):
    courses[tag_column] = None
    courses[tag_column] = pd.Categorical(courses[tag_column])
#
# Masks with a high chance of being high-demand courses, merged into one mask.
high_demand_mask = comsci_mask | data_mask | ai_mask | soft_mask | bioinfo_mask | stat_mask | industmath_mask
# Very interesting (or even weird) courses; by construction this stays a
# subset of high_demand_mask.
very_interesting_mask = bioinfo_mask | industmath_mask | data_mask | ai_mask
# Adjust "user_tag_1".
# NOTE: `inplace=True` for cat.add_categories was deprecated and removed in
# pandas 2.0, so reassign the returned Series instead.
courses['user_tag_1'] = courses['user_tag_1'].cat.add_categories(
    ['HighDemand', 'HighDemandAndInteresting'])
courses.loc[high_demand_mask, 'user_tag_1'] = 'HighDemand'
# The interesting subset overwrites the plain HighDemand tag.
courses.loc[very_interesting_mask, 'user_tag_1'] = 'HighDemandAndInteresting'
#
# Entries not tagged as high-demand still need manual review later.
no_status_mask = ~high_demand_mask
# Tag according to masks: later assignments overwrite earlier ones, so the
# order below encodes priority (lowest first, 'ai' highest).
# NOTE: `inplace=True` for cat.add_categories was deprecated and removed in
# pandas 2.0, so reassign the returned Series instead.
_tag2_priority = [
    ('info', info_mask),
    ('stat', stat_mask),
    ('bioinfo', bioinfo_mask),
    ('math', math_mask),
    ('soft', soft_mask),
    ('com', com_mask),
    ('industmath', industmath_mask),
    ('data', data_mask),
    ('comsci', comsci_mask),
    ('ai', ai_mask),
]
courses['user_tag_2'] = courses['user_tag_2'].cat.add_categories(
    [tag for tag, _ in _tag2_priority])
for tag, mask in _tag2_priority:
    courses.loc[mask, 'user_tag_2'] = tag
#
# For Pretty Print (See "Shared Resources (Used by my other works)" >
# "Pretty-Print of Pandas_DataFrame").
print_df = courses[high_demand_mask][['university_name_th','university_type_name_th','campus_name_th','faculty_name_th','group_field_th','field_name_th','major_name_th','program_name_th','program_type_name_th','user_tag_1','user_tag_2']]
# Save raw data (not included since the online data may change).
# Use a context manager so the file handle is closed deterministically;
# the original bare open().write() leaked the handle.
with open('/tmp/2564_MyCoursesList_Raw.json', 'wb') as fh:
    fh.write(courses_json)
# Saving as a new copied Pandas DataFrame and export.
courses_readydata = courses.copy()
courses_readydata.to_csv('/tmp/2564_MyCoursesList_ReadyData.csv', index=False)
#
# Exporting only the high-demand subset.
courses[high_demand_mask].to_csv('/tmp/2564_MyCoursesList.csv', index=False)
# Reload the exported CSVs, restoring dtypes lost in the CSV round-trip.
# The category-column list was previously duplicated verbatim for both reads;
# it is now defined once (DRY) — behavior is unchanged.
conv_columns=['university_type_id','university_type_name_th','university_id','university_name_th','university_name_en','campus_id','campus_name_th','campus_name_en','faculty_id','faculty_name_th','faculty_name_en','group_field_id','group_field_th','field_id','field_name_th','field_name_en','program_type_id','program_type_name_th','institute_partners_th','country_partners_th']
dtypes_to_change = {'program_running_number': 'str'}
for col in conv_columns:
    dtypes_to_change[col] = 'category'
courses = pd.read_csv('2564_MyCoursesList_ReadyData.csv', dtype=dtypes_to_change)
# The filtered export additionally carries the user tag columns as categories.
dtypes_to_change = {'program_running_number': 'str'}
for col in conv_columns + ['user_tag_1', 'user_tag_2']:
    dtypes_to_change[col] = 'category'
courses = pd.read_csv('2564_MyCoursesList.csv', dtype=dtypes_to_change)
# Filenames to read (See Inputted Data A).
names=['ai-degrees','data-degrees','com-degrees','soft-degrees','stat-degrees','info-degrees']
from types import SimpleNamespace
import pandas as pd
files={}
for name in names:
    file = SimpleNamespace(name=name)
    # Read via a context manager so the handle is closed deterministically;
    # the original bare open().read() leaked the handle.
    with open(file.name) as fh:
        file.lines = fh.read().splitlines()
    ## Automatically checking: every record must span exactly 5 lines.
    print('[Automatically checking]')
    assert len(file.lines) % 5 == 0
    print('Passed')
    print()
    ## Manually checking: show tail samples of each 5-line stride for eyeballing.
    ## NOTE(review): the bare slice expressions only display in a notebook cell;
    ## in a plain script they produce no output — confirm intended environment.
    print('[Manually checking]', end='')
    print('\n+ Below must be University Name')
    file.lines[0::5][-10:]  # Must be University Name
    print('\n+ Below must be Code')
    file.lines[4::5][-10:]  # Must be University Code
    print('\n+ Below must be University Name')
    file.lines[5::5][-10:]  # Must be University Name
    input('\nOK?')
    print()
    ## Saving data: columns follow the 5-line record layout
    ## (university, faculty, branch, course, university_code).
    files[name] = pd.DataFrame({
        'university': pd.Categorical(file.lines[0::5]),
        'faculty': pd.Categorical(file.lines[1::5]),
        'branch': file.lines[2::5],
        'course': file.lines[3::5],
        'university_code': pd.Categorical(file.lines[4::5]),
    })
# Merge every entry from all files, then drop duplicates.
df = pd.concat([files[name] for name in names]).drop_duplicates().reset_index(drop=True)
# Re-apply the category dtype (concat of categoricals degrades to object).
df['university'] = pd.Categorical(df['university'])
df['faculty'] = pd.Categorical(df['faculty'])
df['university_code'] = pd.Categorical(df['university_code'])
#
# Add the new user tag column as an empty categorical.
df['user_tag'] = None
df['user_tag'] = pd.Categorical(df['user_tag'])
#
# Register the user-defined tag labels (each constant is ['label', [indices]];
# see "Misc") as categories. NOTE: `inplace=True` for cat.add_categories was
# deprecated and removed in pandas 2.0, so reassign the returned Series.
df['user_tag'] = df['user_tag'].cat.add_categories(
    [Unknown[0], General[0], LowDesired[0], NoNeed[0],
     Health[0], Geo[0], Teach[0], Lang[0]])
#
# Mask of entries whose university code appears in accept_university_code.
# NOTE: renamed from `bool`, which shadowed the builtin of the same name.
is_accepted = df.university_code.apply(lambda x: x in accept_university_code)
# Substitute the current frame with the filtered result.
df = df[is_accepted].reset_index(drop=True)
# Set user_tag for each tagged group; each constant is ['label', [row indices]].
for label, rows in (NoNeed, Health, Geo, Teach, Lang):
    df.loc[rows, 'user_tag'] = label
#
# Exporting
df.to_csv('/tmp/result.csv', index=False)
- Contains a dataset of course entries
- Text was copied by simple mouse-dragging from the queried search-result table of https://www.mytcas.com/search
- Tested on the website system in early 2021
- Each entry must have 5 lines, consisting of
- university, faculty, branch, program, university_code
มหาวิทยาลัยธุรกิจบัณฑิตย์
วิทยาลัยการแพทย์บูรณาการ
เครื่องสำอาง-ความงาม
วิทยาศาสตรบัณฑิต สาขาวิชาบูรณาการสุขภาพและความงาม (ภาษาไทย ปกติ)
103
มหาวิทยาลัยธุรกิจบัณฑิตย์
วิทยาลัยการแพทย์บูรณาการ
เทคโนโลยีการอาหาร
วิทยาศาสตรบัณฑิต สาขาวิชาการประกอบอาหารเพื่อสุขภาพ (ภาษาไทย ปกติ)
103
# Acceptable university markers, on each index of the universities dataset
# (user-selected from 2564_UniversityList.csv). One boolean per row,
# index-aligned; True marks a university the user wants to keep.
universities_isDesired=[True, True, True, True, True, False, False, True, False, False, True, False, False, False, True, True, False, False, False, True, False, True, False, False, False, False, False, True, True, False, False, True, True, True, True, False, False, True, True, True, False, False, False, False, False, False, False, False, False, False, False, False, True, True, True, True, True, True, True, False, True, True, True, False, True, False, True, True, True, True, False, False, True, True, False, False, False, True, False, False, True, False, True]
# Acceptable university codes: the 'code' column of the rows flagged True above.
accept_university_code=list(universities[universities_isDesired].code)
# University IDs appearing in the tagged course subsets.
university_with_excellent_courses = list(courses.loc[very_interesting_mask, 'university_id'].unique())
university_with_high_demand_courses = list(courses.loc[high_demand_mask, 'university_id'].unique())
# Every university ID present in the (already filtered) courses frame.
available_courses_university_code = list(courses['university_id'].unique())
#
# Name and Index value for classification entries (Outdated, Only Used In "Courses Datasets">"Legacy Data Processing")
# Each constant is a pair: [Thai display label, list of row indices to tag].
# English glosses of the Thai labels below are approximate — TODO confirm.
Unknown=['ไม่กำหนดประเภท',[]]  # "no type assigned"
General=['ทั่วไป',[]]  # "general"
MedDesired=['มีความต้องการปานกลาง',[26, 29, 34, 41, 44, 62, 65, 66, 91, 92, 94, 95, 98, 99, 100, 102, 103, 104, 105, 106, 107, 109, 111, 113, 114, 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 133]]  # "moderately desired"
LowDesired=['มีความต้องการน้อย',[14, 20, 21, 23, 25, 28, 31, 32, 33, 38, 40, 42, 45, 46, 47, 48, 51, 52, 53, 54, 58, 61, 67, 77, 81, 82, 87, 96, 97, 101, 108, 110, 112, 115, 134]]  # "slightly desired"
Health=['พิเศษที่สายสุขภาพ',[6,13]]  # "special: health track"
Geo=['พิเศษที่ภูมิศาสตร์',[93,124,130]]  # "special: geography"
Teach=['พิเศษที่คุรุศาสตร์',[24]]  # "special: education/teaching"
Lang=['พิเศษที่ภาษา',[131,132]]  # "special: languages"
#
# Pretty-print every row of print_df into a plain-text report:
# a centered "# Index (i) #" banner (80 chars wide), then one "col: value"
# line per column, then a blank separator.
print_df = courses
cols = print_df.columns
chunks = []
for i in print_df.index:
    msg = ' Index (' + str(i) + ') '
    pad = 80 - len(msg)
    # Extra '#' goes on the left when pad is odd (matches original layout).
    chunks.append('#' * (pad - (pad // 2)))
    chunks.append(msg)
    chunks.append('#' * (pad // 2) + '\n')
    for col in cols:
        chunks.append(col + ': ' + str(print_df[col].loc[i]) + '\n')
    chunks.append('\n\n')
# Join once and write via a context manager; the original bare open().write()
# leaked the file handle (and built the string with quadratic +=).
with open('/tmp/pretty_printed', 'w') as fh:
    fh.write(''.join(chunks))
print_dict=data[30]
text=''
for key,value in print_dict.items():
text+=str(key)
text+=': '
text+=str(value)
text+='\n'
open('/tmp/pretty_printed','w').write(text)