Memory profiling
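The report below was produced with the memory_profiler package's line-by-line profiler. In its output, `Mem usage` is the process's total memory after the line executed and `Increment` is the change attributed to that line; only lines that actually ran get measurements. The following is a minimal, self-contained sketch of the setup, assuming memory_profiler is installed (`pip install memory-profiler`); `build_big_list` is a stand-in function for illustration, not part of the profiled module:

    from memory_profiler import profile

    # Mirror the decorator on profiled line 715: the line-by-line report
    # is written to the given stream instead of stdout.
    @profile(stream=open("build_big_list.mprof", "w"))
    def build_big_list(n):
        data = [0] * n     # the allocation shows up as a large Increment
        return sum(data)   # summing adds little attributed memory

    if __name__ == "__main__":
        build_big_list(10_000_000)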
Filename: /home/yuvipanda/align/calculate_alignment.py

Line #    Mem usage    Increment   Line Contents
================================================
   715    310.1 MiB    310.1 MiB   @profile(stream=open("calculate_alignment.mprof", "w"))
   716                             def calculate_alignment(input_files,
   717                                                                     output_file_directory,
   718                                                                     semantic_model_input_file,
   719                                                                     pretrained_input_file,
   720                                                                     high_sd_cutoff=3,
   721                                                                     low_n_cutoff=1,
   722                                                                     delay=1,
   723                                                                     maxngram=2,
   724                                                                     use_pretrained_vectors=True,
   725                                                                     ignore_duplicates=True,
   726                                                                     add_stanford_tags=False,
   727                                                                     input_as_directory=True):
   728
729 """ | |
730 Calculate lexical, syntactic, and conceptual alignment between speakers. | |
731 | |
732 Given a directory of individual .txt files and the | |
733 vocabulary list that have been generated by the `prepare_transcripts` | |
734 preparation stage, return multi-level alignment | |
735 scores with turn-by-turn and conversation-level metrics. | |
736 | |
737 Parameters | |
738 ---------- | |
739 | |
740 input_files : str (directory name) or list of str (file names) | |
741 Cleaned files to be analyzed. Behavior governed by `input_as_directory` | |
742 parameter as well. | |
743 | |
744 output_file_directory : str | |
745 Name of directory where output for individual conversations will be | |
746 saved. | |
747 | |
748 semantic_model_input_file : str | |
749 Name of file to be used for creating the semantic model. A compatible | |
750 file will be saved as an output of `prepare_transcripts()`. | |
751 | |
752 pretrained_input_file : str or None | |
753 If using a pretrained vector to create the semantic model, use | |
754 name of model here. If not, use None. Behavior governed by | |
755 `use_pretrained_vectors` parameter as well. | |
756 | |
757 high_sd_cutoff : int, optional (default: 3) | |
758 High-frequency cutoff (in SD over the mean) for lexical items | |
759 when creating the semantic model. | |
760 | |
761 low_n_cutoff : int, optional (default: 1) | |
762 Low-frequency cutoff (in raw frequency) for lexical items when | |
763 creating the semantic models. Items with frequency less than or | |
764 equal to the number provided here will be removed. To remove the | |
765 low-frequency cutoff, set to 0. | |
766 | |
767 delay : int, optional (default: 1) | |
768 Delay (or lag) at which to calculate similarity. A lag of 1 (default) | |
769 considers only adjacent turns. | |
770 | |
771 maxngram : int, optional (default: 2) | |
772 Maximum n-gram size for calculations. Similarity scores for n-grams | |
773 from unigrams to the maximum size specified here will be calculated. | |
774 | |
775 use_pretrained_vectors : boolean, optional (default: True) | |
776 Specify whether to use a pretrained gensim model for word2vec | |
777 analysis (True) or to construct a new model from the provided corpus | |
778 (False). If True, the file name of a valid model must be | |
779 provided to the `pretrained_input_file` parameter. | |
780 | |
781 ignore_duplicates : boolean, optional (default: True) | |
782 Specify whether to remove exact duplicates when calculating | |
783 part-of-speech similarity scores (True) or to retain perfectly | |
784 mimicked lexical items for POS similarity calculation (False). | |
785 | |
786 add_stanford_tags : boolean, optional (default: False) | |
787 Specify whether to return part-of-speech similarity scores based on | |
788 Stanford POS tagger in addition to the Penn POS tagger (True) or to | |
789 return only POS similarity scores from the Penn tagger (False). (Note: | |
790 Including Stanford POS tags will lead to a significant increase in | |
791 processing time.) | |
792 | |
793 input_as_directory : boolean, optional (default: True) | |
794 Specify whether the value passed to `input_files` parameter should | |
795 be read as a directory (True) or a list of files to be processed | |
796 (False). | |
797 | |
798 Returns | |
799 ------- | |
800 | |
801 real_final_turn_df : Pandas DataFrame | |
802 A dataframe of lexical, syntactic, and conceptual alignment scores | |
803 between turns at specified delay. `NaN` values will be returned for | |
804 turns in which the speaker only produced words that were removed | |
805 from the corpus (e.g., too rare or too common words) or words that were | |
806 present in the corpus but not in the semantic model. | |
807 | |
808 real_final_convo_df : Pandas DataFrame | |
809 A dataframe of lexical, syntactic, and conceptual alignment scores | |
810 between participants across the entire conversation. | |
811 | |
812 """ | |
   813
   814                                 # grab the files in the list
   815    310.1 MiB      0.0 MiB       if not input_as_directory:
   816                                     file_list = glob.glob(input_files)
   817                                 else:
   818    310.1 MiB      0.0 MiB           file_list = glob.glob(input_files + "/*.txt")
   819
   820                                 # build the semantic model to be used for all conversations
   821    310.1 MiB      0.0 MiB       [vocablist, highDimModel] = BuildSemanticModel(semantic_model_input_file=semantic_model_input_file,
   822    310.1 MiB      0.0 MiB                                                      pretrained_input_file=pretrained_input_file,
   823    310.1 MiB      0.0 MiB                                                      use_pretrained_vectors=use_pretrained_vectors,
   824    310.1 MiB      0.0 MiB                                                      high_sd_cutoff=high_sd_cutoff,
   825   5440.8 MiB   5130.8 MiB                                                      low_n_cutoff=low_n_cutoff)
   826
   827                                 # create containers for alignment values
   828   5440.8 MiB      0.0 MiB       AlignmentT2T = pd.DataFrame()
   829   5440.8 MiB      0.0 MiB       AlignmentC2C = pd.DataFrame()
   830
   831                                 # cycle through each prepared file
   832   5445.3 MiB      0.0 MiB       for fileName in file_list:
   833
   834                                     # process the file if it's got a valid conversation
   835   5444.8 MiB      0.0 MiB           dataframe = pd.read_csv(fileName, sep='\t', encoding='utf-8')
   836   5444.8 MiB      0.0 MiB           if len(dataframe) > 1:
   837
   838                                         # let us know which filename we're processing
   839   5444.8 MiB      0.0 MiB               print("Processing: " + fileName)
   840
   841                                         # calculate turn-by-turn alignment scores
   842   5444.8 MiB      0.0 MiB               xT2T = TurnByTurnAnalysis(dataframe=dataframe,
   843   5444.8 MiB      0.0 MiB                                         delay=delay,
   844   5444.8 MiB      0.0 MiB                                         maxngram=maxngram,
   845   5444.8 MiB      0.0 MiB                                         vocablist=vocablist,
   846   5444.8 MiB      0.0 MiB                                         highDimModel=highDimModel,
   847   5444.8 MiB      0.0 MiB                                         add_stanford_tags=add_stanford_tags,
   848   5445.3 MiB      3.0 MiB                                         ignore_duplicates=ignore_duplicates)
   849   5445.3 MiB      0.0 MiB               AlignmentT2T = AlignmentT2T.append(xT2T)
   850
   851                                         # calculate conversation-level alignment scores
   852   5445.3 MiB      0.0 MiB               xC2C = ConvoByConvoAnalysis(dataframe=dataframe,
   853   5445.3 MiB      0.0 MiB                                           maxngram=maxngram,
   854   5445.3 MiB      0.0 MiB                                           ignore_duplicates=ignore_duplicates,
   855   5445.3 MiB      1.5 MiB                                           add_stanford_tags=add_stanford_tags)
   856   5445.3 MiB      0.0 MiB               AlignmentC2C = AlignmentC2C.append(xC2C)
   857
   858                                     # if it's invalid, let us know
   859                                     else:
   860                                         print("Invalid file: " + fileName)
   861
   862                                 # update final dataframes
   863   5445.3 MiB      0.0 MiB       real_final_turn_df = AlignmentT2T.reset_index(drop=True)
   864   5445.3 MiB      0.0 MiB       real_final_convo_df = AlignmentC2C.reset_index(drop=True)
   865
   866                                 # export the final files
   867   5445.3 MiB      0.0 MiB       real_final_turn_df.to_csv(output_file_directory
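Reading the report: virtually the entire footprint (a ~5.1 GiB increment, attributed to profiled lines 821-825) comes from the `BuildSemanticModel` call that loads the pretrained vectors, while each conversation processed in the loop adds only a few MiB. Below is a hypothetical invocation matching the documented signature; it assumes `calculate_alignment` is importable from the profiled module, and every file and directory name is a placeholder:

    # Hypothetical call using the documented defaults; all paths are
    # placeholders, and the pretrained file is assumed to be a
    # gensim-compatible word2vec model.
    turn_df, convo_df = calculate_alignment(
        input_files="prepped_transcripts",                  # directory of cleaned .txt files
        output_file_directory="alignment_output/",
        semantic_model_input_file="vocab_frequencies.txt",  # saved by prepare_transcripts()
        pretrained_input_file="word2vec_vectors.bin",
        use_pretrained_vectors=True)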