diff --git a/README.md b/README.md index d15d856..898300d 100755 --- a/README.md +++ b/README.md @@ -58,18 +58,22 @@ For string based text comparisons we use [jellyfish](https://pypi.org/project/je Each type of feature can be unlocked through the use of a specific command line switch: -* -topics. Indicators for presence of words from common topics. -* -topics=count. Counts of all word matches from common topics. -* -pos. Part of speech proportions in the text. -* -literacy. Checks for common literacy markers. -* -traits. Checks for common stylistic elements or traits that suggest personality type. -* -rhetoric. Checks for rhetorical devices used for persuasion -* -profanity. Profanity check flags. -* -sentiment. Sentiment word counts and score. -* -scarcity. Word scarcity scores. -* -emoticons. Emoticon check flags. -* -embedding. Word embedding vectors from the Spacy Package. -* -comparison. Cross-column comparisons using edit distance metrics +``` + -topics Default: False. Indicators for words from common topics. + -topics=count Count matching words from common topics. + -topics=normalize Count matching topic key words and normalize over topics. + -traits Default: False. Word usage for personality traits. + -rhetoric Default: False. Word usage for rhetorical devices. + -pos Default: False. Part of speech proportions. + -literacy Default: False. Checks for common literacy markers. + -profanity Default: False. Profanity check flags. + -sentiment Default: False. Word counts for positive and negative sentiment. + -scarcity Default: False. Word scarcity scores. + -emoticons Default: False. Emoticon check flags. + -embedding Default: False. Aggregate of Word Embedding Vectors. + -embedding=normalize Normalised Aggregate of Word Embedding Vectors. + -comparison Default: False. Cross-column comparisons. 
+``` ## Usage diff --git a/docs/source/usage.rst b/docs/source/usage.rst index 8fa6af9..c43e239 100644 --- a/docs/source/usage.rst +++ b/docs/source/usage.rst @@ -23,17 +23,21 @@ Without parameters it will print out an error and the following usage : - Supported file types: csv, tsv, xls, xlsx, odf [ARGS] In most cases these are switches that turn on the feature type -columns=. REQUIRED - -topics OR -topics=count. Default: False. Match words from common topics (or count matches). - -traits Default: False. Word usage for personality traits. - -rhetoric Default: False. Word usage for rhetorical devices. - -pos Default: False. Part of speech proportions. - -literacy Default: False. Checks for common literacy markers. - -profanity Default: False. Profanity check flags. - -sentiment Default: False. Words counts for positive and negative sentiment. - -scarcity Default: False. Word scarcity scores. - -emoticons Default: False. Emoticon check flags. - -embedding Default: False. Normalised Aggregate of Word Embedding Vectors. - -comparison Default: False. Cross-column comparisons. + -topics Default: False. Indicators for words from common topics. + -topics=count Count matching words from common topics. + -topics=normalize Count matching topic key words and normalize over topics. + -traits Default: False. Word usage for personality traits. + -rhetoric Default: False. Word usage for rhetorical devices. + -pos Default: False. Part of speech proportions. + -literacy Default: False. Checks for common literacy markers. + -profanity Default: False. Profanity check flags. + -sentiment Default: False. Words counts for positive and negative sentiment. + -scarcity Default: False. Word scarcity scores. + -emoticons Default: False. Emoticon check flags. + -embedding Default: False. Aggregate of Word Embedding Vectors. + -embedding=normalize Normalised Aggregate of Word Embedding Vectors. + -comparison Default: False. Cross-column comparisons. 
+ The list of columns to process and the path to the dataset are both mandatory. diff --git a/texturizer/__init__.py b/texturizer/__init__.py index 95282a6..569f6f4 100755 --- a/texturizer/__init__.py +++ b/texturizer/__init__.py @@ -1,4 +1,4 @@ -__version__ = "0.1.7" +__version__ = "0.1.8" from .pipeline import TextTransform diff --git a/texturizer/embedding.py b/texturizer/embedding.py index 6cd1be8..dd4bb5d 100755 --- a/texturizer/embedding.py +++ b/texturizer/embedding.py @@ -22,18 +22,18 @@ """ ######################################################################################## -def add_text_embedding_features(df, columns): +def add_text_embedding_features(df, columns, type="sum"): """ Given a pandas dataframe and a set of column names. calculate the embedding features and add them. """ rez = df.copy() for col in columns: - rez = add_embedding_features(rez, col) + rez = add_embedding_features(rez, col, type) return rez ######################################################################################## -def add_embedding_features(df, col): +def add_embedding_features(df, col, type="sum"): """ Given a pandas dataframe and a column name. add features that embed the text content into a semantic space. 
@@ -50,7 +50,8 @@ def vec_feats(x, col): for token in doc: vec = vec + token.vector index = index + 1 - vec = vec / index + if type=='normalize': + vec = vec / index return vec.tolist() df[ get_vec_column_names(col) ] = df.apply(vec_feats, col=col, axis=1, result_type="expand") diff --git a/texturizer/featurize.py b/texturizer/featurize.py index f55be94..41bb4f2 100755 --- a/texturizer/featurize.py +++ b/texturizer/featurize.py @@ -53,12 +53,18 @@ def process_df(df, params): end_profile("emoticons") if params["embedding"] : start_profile("embedding") - simple = add_text_embedding_features( simple, params["columns"] ) + if params["normalize_embedding"] : + simple = add_text_embedding_features( simple, params["columns"], 'normalize' ) + else: + simple = add_text_embedding_features( simple, params["columns"] ) end_profile("embedding") if params["topics"] : start_profile("topics") if params["count_matches"] : - simple = add_text_topics_features( simple, params["columns"], 'count' ) + if params["normalize_topics"] : + simple = add_text_topics_features( simple, params["columns"], 'normalize' ) + else: + simple = add_text_topics_features( simple, params["columns"], 'count' ) else: simple = add_text_topics_features( simple, params["columns"] ) end_profile("topics") diff --git a/texturizer/pipeline.py b/texturizer/pipeline.py index 3e62b0b..57823cb 100644 --- a/texturizer/pipeline.py +++ b/texturizer/pipeline.py @@ -64,7 +64,9 @@ def generate_feature_config(self, columns, params): "literacy":False, "scarcity":False, "comparison":False, - "embedding":False + "embedding":False, + "normalize_embedding":False, + "normalize_topics":False, } if "profanity" in params: result["profanity"]=True @@ -76,8 +78,13 @@ def generate_feature_config(self, columns, params): result["emoticons"]=True if "topics" in params: result["topics"]=True - if "count_matches" in params: + if "topics=count" in params: + result["topics"]=True + result["count_matches"]=True + if "topics=normalize" in params: 
+ result["topics"]=True result["count_matches"]=True + result["normalize_topics"]=True if "pos" in params: result["pos"]=True if "traits" in params: @@ -90,6 +97,9 @@ def generate_feature_config(self, columns, params): result["comparison"]=True if "embedding" in params: result["embedding"]=True + if "embedding=normalize" in params: + result["embedding"]=True + result["normalize_embedding"]=True return result diff --git a/texturizer/texturizer.py b/texturizer/texturizer.py index afe101f..fd98906 100755 --- a/texturizer/texturizer.py +++ b/texturizer/texturizer.py @@ -65,7 +65,9 @@ def get_cmd_line_params(argv): "literacy":False, "scarcity":False, "comparison":False, - "embedding":False + "embedding":False, + "normalize_embedding":False, + "normalize_topics":False, } for o in options: parts = o.split("=") @@ -82,6 +84,9 @@ def get_cmd_line_params(argv): if len(parts)>1: if parts[1] == 'count': result["count_matches"]=True + if parts[1] == 'normalize': + result["count_matches"]=True + result["normalize_topics"]=True if parts[0] == "-traits": result["traits"]=True if parts[0] == "-rhetoric": @@ -92,6 +97,9 @@ def get_cmd_line_params(argv): result["emoticons"]=True if parts[0] == "-embedding": result["embedding"]=True + if len(parts)>1: + if parts[1] == 'normalize': + result["normalize_embedding"]=True if parts[0] == "-comparison": result["comparison"]=True if parts[0] == "-columns": @@ -108,17 +116,20 @@ def print_usage(args): print(" - Supported file types: csv, tsv, xls, xlsx, odf") print(" [ARGS] In most cases these are switches that turn on the feature type") print(" -columns=. REQUIRED") - print(" -topics OR -topics=count. Default: False. Match words from common topics (or count matches).") - print(" -traits Default: False. Word usage for personality traits.") - print(" -rhetoric Default: False. Word usage for rhetorical devices.") - print(" -pos Default: False. Part of speech proportions.") - print(" -literacy Default: False. 
Checks for common literacy markers.") - print(" -profanity Default: False. Profanity check flags.") - print(" -sentiment Default: False. Words counts for positive and negative sentiment.") - print(" -scarcity Default: False. Word scarcity scores.") - print(" -emoticons Default: False. Emoticon check flags.") - print(" -embedding Default: False. Normalised Aggregate of Word Embedding Vectors.") - print(" -comparison Default: False. Cross-column comparisons.") + print(" -topics Default: False. Indicators for words from common topics.") + print(" -topics=count Count matching words from common topics.") + print(" -topics=normalize Count matching topic key words and normalize over topics.") + print(" -traits Default: False. Word usage for personality traits.") + print(" -rhetoric Default: False. Word usage for rhetorical devices.") + print(" -pos Default: False. Part of speech proportions.") + print(" -literacy Default: False. Checks for common literacy markers.") + print(" -profanity Default: False. Profanity check flags.") + print(" -sentiment Default: False. Words counts for positive and negative sentiment.") + print(" -scarcity Default: False. Word scarcity scores.") + print(" -emoticons Default: False. Emoticon check flags.") + print(" -embedding Default: False. Aggregate of Word Embedding Vectors.") + print(" -embedding=normalize Normalised Aggregate of Word Embedding Vectors.") + print(" -comparison Default: False. 
Cross-column comparisons.") print("") diff --git a/texturizer/topics.py b/texturizer/topics.py index 4e07d9f..f20fbcd 100755 --- a/texturizer/topics.py +++ b/texturizer/topics.py @@ -86,6 +86,8 @@ def add_text_topics_features(df, columns, type="flag"): for col in columns: if type=="count": rez = add_topic_counts(rez, col) + elif type=="normalize": + rez = add_topic_counts(rez, col, normalize=True) else: rez = add_topic_indicators(rez, col) return rez @@ -136,7 +138,7 @@ def add_topic_indicators(df, col): return df ######################################################################################## -def add_topic_counts(df, col): +def add_topic_counts(df, col, normalize=False): """ Given a pandas dataframe and a column name. Count the number of keyword matches for each topic @@ -159,5 +161,26 @@ def add_topic_counts(df, col): df[col+'_culture']=df[col].str.count(culture_pat, flags=re.IGNORECASE) df[col+'_education']=df[col].str.count(education_pat, flags=re.IGNORECASE) df[col+'_science']=df[col].str.count(science_pat, flags=re.IGNORECASE) + if normalize: + totals = df[col+'_religion'] + df[col+'_politics'] + df[col+'_sex']+ df[col+'_ethnicity']+ df[col+'_economics']+ df[col+'_health']+ df[col+'_sport']+ df[col+'_arts']+ df[col+'_family']+ df[col+'_love']+ df[col+'_crime']+ df[col+'_travel']+ df[col+'_food']+ df[col+'_technology']+ df[col+'_fashion']+ df[col+'_culture']+ df[col+'_education']+df[col+'_science'] + 1 + df[col+'_religion']=df[col+'_religion']/totals + df[col+'_politics']=df[col+'_politics']/totals + df[col+'_sex']=df[col+'_sex']/totals + df[col+'_ethnicity']=df[col+'_ethnicity']/totals + df[col+'_economics']=df[col+'_economics']/totals + df[col+'_health']=df[col+'_health']/totals + df[col+'_sport']=df[col+'_sport']/totals + df[col+'_arts']=df[col+'_arts']/totals + df[col+'_family']=df[col+'_family']/totals + df[col+'_love']=df[col+'_love']/totals + df[col+'_crime']=df[col+'_crime']/totals + df[col+'_travel']=df[col+'_travel']/totals + 
df[col+'_food']=df[col+'_food']/totals + df[col+'_technology']=df[col+'_technology']/totals + df[col+'_fashion']=df[col+'_fashion']/totals + df[col+'_culture']=df[col+'_culture']/totals + df[col+'_education']=df[col+'_education']/totals + df[col+'_science']=df[col+'_science']/totals + return df