import numpy as np

# Question 1
docs = ["The sky is blue", "The sun is dark dark red", "The moon is gray"]


# Question 2
def unique_words(documents):
    """Map every distinct lower-cased word to a column index.

    Documents are lower-cased and split on whitespace. Indices are assigned
    in order of first appearance, starting at 0.

    Args:
        documents: iterable of strings.

    Returns:
        dict mapping word -> integer index.
    """
    words = {}
    for doc in documents:
        for word in doc.lower().split():
            # len(words) is the next free index; setdefault leaves words
            # that were already seen untouched.
            words.setdefault(word, len(words))
    return words


# Question 3
def term_freq(documents):
    """Build the raw term-frequency matrix for ``documents``.

    Args:
        documents: sequence of strings.

    Returns:
        Float ndarray of shape (len(documents), number of unique words);
        entry [d, w] is how many times word ``w`` occurs in document ``d``.
        Columns follow the indexing produced by ``unique_words``.
    """
    words = unique_words(documents)
    mat = np.zeros((len(documents), len(words)))
    for doc_index, doc in enumerate(documents):
        for word in doc.lower().split():
            mat[doc_index, words[word]] += 1
    return mat


# Question 4
def inv_doc_freq(tf_mat):
    """Compute a smoothed inverse document frequency for every term.

    The document frequency of a term is
    ``(number of documents containing it + 1) / ndocs`` and the result is
    ``log(1 / df)``. Because of the ``+ 1``, a term that occurs in every
    document has ``df > 1`` and therefore a negative weight.

    Args:
        tf_mat: 2-D term-frequency array, one row per document.

    Returns:
        1-D array with one IDF value per column of ``tf_mat``.
    """
    ndocs = tf_mat.shape[0]
    # np.sign turns counts into 0/1 presence flags; summing over axis 0
    # counts the documents that contain each term.
    df = (np.sum(np.sign(tf_mat), 0) + 1.) / ndocs
    return np.log(1.0 / df)


# Question 5
def norm_rows(mat):
    """Scale every row of ``mat`` to unit Euclidean length.

    Rows that are entirely zero (e.g. an empty document) are returned
    unchanged instead of producing NaN from a 0/0 division.

    Args:
        mat: 2-D array-like of numbers.

    Returns:
        ``np.matrix`` with the same shape as ``mat``.
    """
    arr = np.asarray(mat, dtype=float)
    # keepdims gives a column vector that broadcasts across each row.
    norms = np.sqrt(np.sum(np.power(arr, 2), axis=1, keepdims=True))
    norms[norms == 0] = 1.0  # leave all-zero rows as zeros
    # Result stays an np.matrix so row indexing keeps yielding 1 x n rows.
    return np.matrix(arr / norms)


# Question 6
tf_mat = term_freq(docs)
idf = inv_doc_freq(tf_mat)
# Scale column j by idf[j]; same values as np.dot(tf_mat, np.diag(idf))
# without building the dense diagonal matrix.
mat = tf_mat * idf
tfidf = norm_rows(mat)

# Question 7
# Cosine similarity between each pair of (unit-length) document rows.
# A single pre-formatted argument prints the same on Python 2 and Python 3.
for first, second in ((0, 1), (0, 2), (1, 2)):
    similarity = np.dot(tfidf[first], tfidf[second].T)
    print("({}, {}) {}".format(first, second, similarity))