diff --git a/texturizer/profanity.py b/texturizer/profanity.py index f841997..a3e6cc2 100755 --- a/texturizer/profanity.py +++ b/texturizer/profanity.py @@ -67,7 +67,7 @@ def prof_features(x, col): mild_profanity = 0 else: text = (x[col].lower()) - word_array = text.split() + word_array = text.split('\s+') hard_profanity = len(hard_re.findall(text)) mask_profanity = len(masked_re.findall(text)) if set(mild_profanity_list).intersection(word_array): diff --git a/texturizer/scarcity.py b/texturizer/scarcity.py index 44a0a75..ad3e610 100755 --- a/texturizer/scarcity.py +++ b/texturizer/scarcity.py @@ -53,7 +53,7 @@ def cal_features(x, col): else: text = remove_urls_and_tags( remove_escapes_and_non_printable( x[col] ) ).lower() text = text.translate(str.maketrans('', '', string.punctuation)).lower() - words = text.split() + words = text.split('\s+') scarcities = list(map(get_scarcity, words)) mean_scarcity = statistics.mean(scarcities) median_scarcity = statistics.median(scarcities) diff --git a/texturizer/simple.py b/texturizer/simple.py index dc9cace..f22f878 100755 --- a/texturizer/simple.py +++ b/texturizer/simple.py @@ -60,9 +60,9 @@ def cal_features(x, col): punct = sum(1 for c in x[col] if c in ['.','!','?',':',';','-',',']) capital_d = capitals/chars punct_d = punct/chars - word_array = x[col].lower().split() + word_array = x[col].lower().split('\s+') sentence_array = [ x for x in re.split("[.?]", x[col].lower()) if x] - line_array = [ x for x in re.split("[\r\n]*", x[col].lower()) if x] + line_array = [ x for x in re.split("[\r\n]+", x[col].lower()) if x] non_stop_words = list(set(word_array) - set(stop_word_list)) word_count = len(word_array) sentence_count = len(sentence_array) diff --git a/texturizer/topics.py b/texturizer/topics.py index fa6f3fd..0b30315 100755 --- a/texturizer/topics.py +++ b/texturizer/topics.py @@ -86,54 +86,10 @@ def add_text_topics_features(df, columns, type="flag"): for col in columns: if type=="count": rez = add_topic_counts(rez, col) - #rez = add_topic_features(rez, col) else: rez = add_topic_indicators(rez, col) return rez -######################################################################################## -def add_topic_features(df, col): - """ - Given a pandas dataframe and a column name. - Count the text matches for topic keywords. - """ - - def prof_features(x, col): - religion_wds = 0 - sex_wds = 0 - politics_wds = 0 - ethno_wds = 0 - econo_wds = 0 - health_wds = 0 - sport_wds = 0 - arts_wds = 0 - family_wds = 0 - love_wds = 0 - crime_wds = 0 - travel_wds = 0 - food_wds = 0 - if x[col]!=x[col]: - sex_wds = 0 - else: - text = (x[col].lower()) - religion_wds = len(religion_re.findall(text)) - politics_wds = len(politics_re.findall(text)) - sex_wds = len(sex_re.findall(text)) - ethno_wds = len(ethno_re.findall(text)) - econo_wds = len(econo_re.findall(text)) - health_wds = len(health_re.findall(text)) - sport_wds = len(sport_re.findall(text)) - arts_wds = len(arts_re.findall(text)) - family_wds = len(family_re.findall(text)) - love_wds = len(love_re.findall(text)) - crime_wds = len(crime_re.findall(text)) - travel_wds = len(travel_re.findall(text)) - food_wds = len(food_re.findall(text)) - return religion_wds, politics_wds, sex_wds, ethno_wds, econo_wds, health_wds, sport_wds, arts_wds, family_wds, love_wds, crime_wds, travel_wds, food_wds - - df[[ col+'_religion', col+'_politics', col+'_sex', col+'_ethnicity', col+'_economics', col+'_health', col+'_sport', col+'_arts', col+'_family', col+'_love', col+'_crime', col+'_travel', col+'_food']] = df.apply(prof_features, col=col, axis=1, result_type="expand") - return df - ######################################################################################## def add_topic_indicators(df, col): """ @@ -166,6 +122,16 @@ def add_topic_indicators(df, col): df.loc[(df[col].notnull()) & (df[col].str.contains(travel_pat)), col+'_travel' ]=1 df[ col+'_food' ]=0 df.loc[(df[col].notnull()) & (df[col].str.contains(food_pat)), col+'_food' ]=1 + df[col+'_technology']= 0 + df.loc[(df[col].notnull()) & (df[col].str.contains(technology_pat)), col+'_technology' ]=1 + df[col+'_fashion']=0 + df.loc[(df[col].notnull()) & (df[col].str.contains(fashion_pat)), col+'_fashion' ]=1 + df[col+'_culture']=0 + df.loc[(df[col].notnull()) & (df[col].str.contains(culture_pat)), col+'_culture' ]=1 + df[col+'_education']=0 + df.loc[(df[col].notnull()) & (df[col].str.contains(education_pat)), col+'_education' ]=1 + df[col+'_science']=0 + df.loc[(df[col].notnull()) & (df[col].str.contains(science_pat)), col+'_science' ]=1 return df