@yuvipanda
Created February 16, 2018 05:51
Memory profiling
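The dump below is line-by-line output from the memory_profiler package: decorating a function with @profile records memory usage before and after each line as it executes, and the stream argument (visible on line 715) redirects the report to a file. A minimal sketch of producing such a report, assuming memory_profiler is installed (pip install memory_profiler):

from memory_profiler import profile

# Send the line-by-line report to a file instead of stdout,
# mirroring the decorator call on line 715 below.
@profile(stream=open("example.mprof", "w"))
def build_big_list():
    # Large allocations show up in the "Increment" column.
    data = [0] * 10_000_000
    squares = [x * x for x in data]
    return len(squares)

if __name__ == "__main__":
    build_big_list()
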
Filename: /home/yuvipanda/align/calculate_alignment.py
Line # Mem usage Increment Line Contents
================================================
715 310.1 MiB 310.1 MiB @profile(stream=open("calculate_alignment.mprof", "w"))
716 def calculate_alignment(input_files,
717 output_file_directory,
718 semantic_model_input_file,
719 pretrained_input_file,
720 high_sd_cutoff=3,
721 low_n_cutoff=1,
722 delay=1,
723 maxngram=2,
724 use_pretrained_vectors=True,
725 ignore_duplicates=True,
726 add_stanford_tags=False,
727 input_as_directory=True):
728
729 """
730 Calculate lexical, syntactic, and conceptual alignment between speakers.
731
732 Given a directory of individual .txt files and the
733 vocabulary list generated by the `prepare_transcripts`
734 preparation stage, return multi-level alignment
735 scores with turn-by-turn and conversation-level metrics.
736
737 Parameters
738 ----------
739
740 input_files : str (directory name) or list of str (file names)
741 Cleaned files to be analyzed. Behavior governed by `input_as_directory`
742 parameter as well.
743
744 output_file_directory : str
745 Name of directory where output for individual conversations will be
746 saved.
747
748 semantic_model_input_file : str
749 Name of file to be used for creating the semantic model. A compatible
750 file will be saved as an output of `prepare_transcripts()`.
751
752 pretrained_input_file : str or None
753 If using pretrained vectors to create the semantic model, give the
754 file name of the model here. If not, use None. Behavior governed by
755 `use_pretrained_vectors` parameter as well.
756
757 high_sd_cutoff : int, optional (default: 3)
758 High-frequency cutoff (in SD over the mean) for lexical items
759 when creating the semantic model.
760
761 low_n_cutoff : int, optional (default: 1)
762 Low-frequency cutoff (in raw frequency) for lexical items when
763 creating the semantic models. Items with frequency less than or
764 equal to the number provided here will be removed. To remove the
765 low-frequency cutoff, set to 0.
766
767 delay : int, optional (default: 1)
768 Delay (or lag) at which to calculate similarity. A lag of 1 (default)
769 considers only adjacent turns.
770
771 maxngram : int, optional (default: 2)
772 Maximum n-gram size for calculations. Similarity scores for n-grams
773 from unigrams to the maximum size specified here will be calculated.
774
775 use_pretrained_vectors : boolean, optional (default: True)
776 Specify whether to use a pretrained gensim model for word2vec
777 analysis (True) or to construct a new model from the provided corpus
778 (False). If True, the file name of a valid model must be
779 provided to the `pretrained_input_file` parameter.
780
781 ignore_duplicates : boolean, optional (default: True)
782 Specify whether to remove exact duplicates when calculating
783 part-of-speech similarity scores (True) or to retain perfectly
784 mimicked lexical items for POS similarity calculation (False).
785
786 add_stanford_tags : boolean, optional (default: False)
787 Specify whether to return part-of-speech similarity scores based on
788 Stanford POS tagger in addition to the Penn POS tagger (True) or to
789 return only POS similarity scores from the Penn tagger (False). (Note:
790 Including Stanford POS tags will lead to a significant increase in
791 processing time.)
792
793 input_as_directory : boolean, optional (default: True)
794 Specify whether the value passed to the `input_files` parameter should
795 be read as a directory (True) or a list of files to be processed
796 (False).
797
798 Returns
799 -------
800
801 real_final_turn_df : Pandas DataFrame
802 A dataframe of lexical, syntactic, and conceptual alignment scores
803 between turns at the specified delay. `NaN` values will be returned for
804 turns in which the speaker only produced words that were removed
805 from the corpus (e.g., too rare or too common words) or words that were
806 present in the corpus but not in the semantic model.
807
808 real_final_convo_df : Pandas DataFrame
809 A dataframe of lexical, syntactic, and conceptual alignment scores
810 between participants across the entire conversation.
811
812 """
813
814 # grab the files in the list
815 310.1 MiB 0.0 MiB if not input_as_directory:
816 file_list = glob.glob(input_files)
817 else:
818 310.1 MiB 0.0 MiB file_list = glob.glob(input_files+"/*.txt")
819
820 # build the semantic model to be used for all conversations
821 310.1 MiB 0.0 MiB [vocablist, highDimModel] = BuildSemanticModel(semantic_model_input_file=semantic_model_input_file,
822 310.1 MiB 0.0 MiB pretrained_input_file=pretrained_input_file,
823 310.1 MiB 0.0 MiB use_pretrained_vectors=use_pretrained_vectors,
824 310.1 MiB 0.0 MiB high_sd_cutoff=high_sd_cutoff,
825 5440.8 MiB 5130.8 MiB low_n_cutoff=low_n_cutoff)
826
827 # create containers for alignment values
828 5440.8 MiB 0.0 MiB AlignmentT2T = pd.DataFrame()
829 5440.8 MiB 0.0 MiB AlignmentC2C = pd.DataFrame()
830
831 # cycle through each prepared file
832 5445.3 MiB 0.0 MiB for fileName in file_list:
833
834 # process the file if it's got a valid conversation
835 5444.8 MiB 0.0 MiB dataframe=pd.read_csv(fileName, sep='\t',encoding='utf-8')
836 5444.8 MiB 0.0 MiB if len(dataframe) > 1:
837
838 # let us know which filename we're processing
839 5444.8 MiB 0.0 MiB print("Processing: "+fileName)
840
841 # calculate turn-by-turn alignment scores
842 5444.8 MiB 0.0 MiB xT2T=TurnByTurnAnalysis(dataframe=dataframe,
843 5444.8 MiB 0.0 MiB delay=delay,
844 5444.8 MiB 0.0 MiB maxngram=maxngram,
845 5444.8 MiB 0.0 MiB vocablist=vocablist,
846 5444.8 MiB 0.0 MiB highDimModel=highDimModel,
847 5444.8 MiB 0.0 MiB add_stanford_tags=add_stanford_tags,
848 5445.3 MiB 3.0 MiB ignore_duplicates=ignore_duplicates)
849 5445.3 MiB 0.0 MiB AlignmentT2T=AlignmentT2T.append(xT2T)
850
851 # calculate conversation-level alignment scores
852 5445.3 MiB 0.0 MiB xC2C = ConvoByConvoAnalysis(dataframe=dataframe,
853 5445.3 MiB 0.0 MiB maxngram = maxngram,
854 5445.3 MiB 0.0 MiB ignore_duplicates=ignore_duplicates,
855 5445.3 MiB 1.5 MiB add_stanford_tags = add_stanford_tags)
856 5445.3 MiB 0.0 MiB AlignmentC2C=AlignmentC2C.append(xC2C)
857
858 # if it's invalid, let us know
859 else:
860 print("Invalid file: "+fileName)
861
862 # update final dataframes
863 5445.3 MiB 0.0 MiB real_final_turn_df = AlignmentT2T.reset_index(drop=True)
864 5445.3 MiB 0.0 MiB real_final_convo_df = AlignmentC2C.reset_index(drop=True)
865
866 # export the final files
867 5445.3 MiB 0.0 MiB real_final_turn_df.to_csv(output_file_directory