# This takes a while - after cleaning, save as .csv
df_clean = df.copy()
df_clean['text'] = df_clean['text'].apply(convert_text)
df_clean.to_csv('mental_heath_EJC_cleaned.csv')
df_clean.head()
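# Optional sanity check (df_reloaded is a hypothetical name, not from the
# notebook): the cleaned file can be reloaded in a later session so the slow
# convert_text pass does not need to be re-run; index_col=0 restores the
# index that to_csv wrote out.
df_reloaded = pd.read_csv('mental_heath_EJC_cleaned.csv', index_col=0)
df_reloaded.head()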
# Big words only
df_bw = df_clean.copy()
df_bw['text'] = df_bw['text'].apply(remove_small_words, max_chars=2)
df_bw.to_csv('mental_heath_EJC_big_words.csv')
df_bw.head()
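# For reference, a minimal sketch of what remove_small_words is assumed to do:
# drop every token of max_chars characters or fewer. The real helper is defined
# earlier in the notebook; remove_small_words_sketch is a hypothetical stand-in.
def remove_small_words_sketch(text, max_chars=2):
    return ' '.join(word for word in text.split() if len(word) > max_chars)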
# Compute a GloVe document vector for each text with spaCy
gloveX = np.vstack(df_bw['text'].apply(lambda x: nlp(x).vector).values)

# Save the GloVe vectors as .json
import json
with open('gloveX_big.json', 'w') as f:
    json.dump(gloveX.tolist(), f)
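# The saved vectors can be restored later; json.load returns nested lists,
# so wrap the result in np.array to get the matrix back.
with open('gloveX_big.json') as f:
    gloveX_reloaded = np.array(json.load(f))
print(gloveX_reloaded.shape)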
# NOTE: this uses everything, but we want to use the training data only.
# Create the dictionary and corpus needed for topic modeling with gensim
# (tokenizing once and reusing it, rather than splitting twice)
tokenized = df_bw['text'].apply(lambda x: x.split())
id2word = gensim.corpora.Dictionary(tokenized)
corpus = [id2word.doc2bow(text) for text in tokenized]
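# Quick check that the dictionary and corpus line up: map the first document's
# bag-of-words ids back to their tokens and counts.
print([(id2word[token_id], count) for token_id, count in corpus[0]])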
print(len(id2word))  # Vocabulary size from the input corpus
print(len(corpus))   # Number of documents in the bag-of-words corpus (a sparse DTM, so it has no dense shape)
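# lda_model_gs is used below but trained elsewhere in the notebook; a minimal
# sketch of fitting such a model with gensim (num_topics=10 and random_state=42
# are assumed placeholders, not the notebook's actual settings):
lda_model_gs = gensim.models.LdaModel(corpus=corpus, id2word=id2word,
                                      num_topics=10, random_state=42)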
# Print each document's topic associations
for count, doc_topics in enumerate(lda_model_gs[corpus]):
    print("doc : ", count, doc_topics)
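# topic_distributions (used below) is assumed to be the dense documents-by-topics
# matrix; one way to build it is to fill a zero matrix from each document's
# topic probabilities, with minimum_probability=0.0 so every topic is reported.
num_topics = lda_model_gs.num_topics
topic_distributions = np.zeros((len(corpus), num_topics))
for doc_idx, bow in enumerate(corpus):
    for topic_id, prob in lda_model_gs.get_document_topics(bow, minimum_probability=0.0):
        topic_distributions[doc_idx, topic_id] = prob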
# Limited success with this, so will revisit later
from scipy.cluster.hierarchy import linkage

tddata = pd.DataFrame(np.array(topic_distributions))

# Create a linkage matrix over the documents (complete linkage, correlation distance)
linkage_matrix = linkage(tddata, method='complete', metric='correlation')
column_order = list(linkage_matrix[:, 0].astype(int))
print(linkage_matrix[0])
print(column_order)

# Earlier clustermap attempts, kept commented out for reference:
# sns.clustermap(tddata, row_cluster=True, col_cluster=False, col_linkage=linkage_matrix)
# sns.clustermap(tddata, metric='euclidean', method='ward')
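# A possible cleaner follow-up: scipy's leaves_list returns the dendrogram leaf
# order directly, which is usually the ordering a heatmap wants (rather than the
# raw first column of the linkage matrix used above).
from scipy.cluster.hierarchy import leaves_list
row_order = leaves_list(linkage_matrix)
sns.clustermap(tddata.iloc[row_order], row_cluster=False, col_cluster=False)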
# %%time
# ANSWER
import string
from collections import Counter

# Count the number of punctuation marks in a text
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return count

# Count the number of all-uppercase words in a text
def count_upper(text):
    count = len([word for word in text.split() if word.isupper()])
    return count

# Count selected part-of-speech tags in a text (output order matches keys_to_index)
def count_pos_tags(text):
    doc = nlp(text)
    pos_tags = [token.pos_ for token in doc]
    pos_counts = Counter(pos_tags)
    keys_to_index = ["ADJ", "ADV", "NOUN", "NUM", "PRON", "PROPN", "VERB"]
    pos_counts = [pos_counts.get(key, 0) for key in keys_to_index]
    return pos_counts
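# Example usage: turn the three helpers into feature columns (the column names
# and df_feat are illustrative, not from the notebook).
df_feat = df_bw.copy()
df_feat['n_punct'] = df_feat['text'].apply(count_punct)
df_feat['n_upper'] = df_feat['text'].apply(count_upper)
pos_cols = ["ADJ", "ADV", "NOUN", "NUM", "PRON", "PROPN", "VERB"]
pos_df = pd.DataFrame(df_feat['text'].apply(count_pos_tags).tolist(),
                      columns=pos_cols, index=df_feat.index)
df_feat = pd.concat([df_feat, pos_df], axis=1)
df_feat.head()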