# All Import Statements Defined Here
# Note: Do not add to this list.
# All the dependencies you need can be installed by running .
# ----------------
import sys
assert sys.version_info[0] == 3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)
# ----------------
Part 1: Count-Based Word Vectors (10 points)
Next come the word vectors themselves: we build a co-occurrence matrix, reduce its dimensionality with SVD, and finally plot the word vectors obtained from the co-occurrence matrix.
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1

    # ------------------
    # Write your implementation here.
    corpus_words = sorted(list({word for words in corpus for word in words}))
    num_corpus_words = len(corpus_words)
    # ------------------

    return corpus_words, num_corpus_words
# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# ---------------------

# Define toy corpus
test_corpus = ["START All that glitters isn't gold END".split(" "), "START All's well that ends well END".split(" ")]
test_corpus_words, num_corpus_words = distinct_words(test_corpus)

# Correct answers
ans_test_corpus_words = sorted(list(set(["START", "All", "ends", "that", "gold", "All's", "glitters", "isn't", "well", "END"])))
ans_num_corpus_words = len(ans_test_corpus_words)

# Test correct number of words
assert(num_corpus_words == ans_num_corpus_words), "Incorrect number of distinct words. Correct: {}. Yours: {}".format(ans_num_corpus_words, num_corpus_words)

# Test correct words
assert (test_corpus_words == ans_test_corpus_words), "Incorrect corpus_words.\nCorrect: {}\nYours: {}".format(str(ans_test_corpus_words), str(test_corpus_words))

# Print Success
print("-" * 80)
print("Passed All Tests!")
print("-" * 80)
The output:
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).
        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
        number of co-occurring words.
        For example, if we take the document "START All that glitters is not gold END" with window size of 4,
        "All" will co-occur with "START", "that", "glitters", "is", and "not".
        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    M = None
    word2Ind = {}

    # ------------------
    # Write your implementation here.
    M = np.zeros((num_words, num_words))
    word2Ind = {word: i for i, word in enumerate(words)}
    for doc in corpus:
        for i, word in enumerate(doc):
            for j in range(i - window_size, i + window_size + 1):
                if j < 0 or j >= len(doc):
                    continue
                if j != i:
                    M[word2Ind[word], word2Ind[doc[j]]] += 1
    # ------------------

    return M, word2Ind
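To make the docstring's window example concrete, here is a small hedged demo (not part of the assignment) that runs the function on that single document and lists the words co-occurring with "All":

# Mini-demo of the windowing rule: "All" sits at position 1, so with
# window_size=4 its context is positions 0..5 (clipped at the document edge),
# excluding itself. "gold" at position 6 falls outside the window.
doc = "START All that glitters is not gold END".split(" ")
M_demo, w2i_demo = compute_co_occurrence_matrix([doc], window_size=4)
neighbors = sorted(w for w in w2i_demo if M_demo[w2i_demo['All'], w2i_demo[w]] > 0)
print(neighbors)  # ['START', 'glitters', 'is', 'not', 'that']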
# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# ---------------------

# Define toy corpus and get student's co-occurrence matrix
test_corpus = ["START All that glitters isn't gold END".split(" "), "START All's well that ends well END".split(" ")]
M_test, word2Ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)

# Correct M and word2Ind
M_test_ans = np.array(
    [[0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
     [0., 0., 0., 1., 0., 0., 0., 0., 0., 1.],
     [0., 0., 0., 0., 0., 0., 1., 0., 0., 1.],
     [1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
     [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
     [0., 0., 0., 0., 0., 0., 0., 1., 1., 0.],
     [0., 0., 1., 0., 0., 0., 0., 1., 0., 0.],
     [0., 0., 0., 0., 0., 1., 1., 0., 0., 0.],
     [1., 0., 0., 0., 1., 1., 0., 0., 0., 1.],
     [0., 1., 1., 0., 1., 0., 0., 0., 1., 0.]]
)
word2Ind_ans = {'All': 0, "All's": 1, 'END': 2, 'START': 3, 'ends': 4, 'glitters': 5, 'gold': 6, "isn't": 7, 'that': 8, 'well': 9}

# Test correct word2Ind
assert (word2Ind_ans == word2Ind_test), "Your word2Ind is incorrect:\nCorrect: {}\nYours: {}".format(word2Ind_ans, word2Ind_test)

# Test correct M shape
assert (M_test.shape == M_test_ans.shape), "M matrix has incorrect shape.\nCorrect: {}\nYours: {}".format(M_test.shape, M_test_ans.shape)

# Test correct M values
for w1 in word2Ind_ans.keys():
    idx1 = word2Ind_ans[w1]
    for w2 in word2Ind_ans.keys():
        idx2 = word2Ind_ans[w2]
        student = M_test[idx1, idx2]
        correct = M_test_ans[idx1, idx2]
        if student != correct:
            print("Correct M:")
            print(M_test_ans)
            print("Your M: ")
            print(M_test)
            raise AssertionError("Incorrect count at index ({}, {})=({}, {}) in matrix M. Yours has {} but should have {}.".format(idx1, idx2, w1, w2, student, correct))

# Print Success
print("-" * 80)
print("Passed All Tests!")
print("-" * 80)
The output:
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html
        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # ------------------
    # Write your implementation here.
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)
    # ------------------

    print("Done.")
    return M_reduced
The output of the sanity-check cell for reduce_to_k_dim (the test cell itself is omitted from this excerpt):
Running Truncated SVD over 10 words...
Done.
--------------------------------------------------------------------------------
Passed All Tests!
--------------------------------------------------------------------------------
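As the docstring notes, fit_transform returns (approximately) U * S. A quick numerical sketch, not part of the assignment, comparing TruncatedSVD against numpy's full SVD on a random matrix (signs of singular vectors are arbitrary, so we compare absolute values):

# Check that TruncatedSVD.fit_transform(A) matches U_k * S_k from a full SVD.
A = np.random.rand(10, 10)
U, S, Vt = np.linalg.svd(A)
us_k = U[:, :2] * S[:2]  # first two left singular vectors, scaled by singular values
svd_k = TruncatedSVD(n_components=2, n_iter=10).fit_transform(A)
print(np.allclose(np.abs(us_k), np.abs(svd_k)))  # expected: True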
def plot_embeddings(M_reduced, word2Ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.
        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus, k)): matrix of k-dimensional word embeddings
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    # ------------------
    # Write your implementation here.
    for word in words:
        coord = M_reduced[word2Ind[word]]
        x = coord[0]
        y = coord[1]
        plt.scatter(x, y, marker='x', color='red')
        plt.text(x, y, word, fontsize=9)
    plt.show()  # show once, after all points are drawn
    # ------------------
# ---------------------
# Run this sanity check
# Note that this is not an exhaustive check for correctness.
# The plot produced should look like the "test solution plot" depicted below.
# ---------------------

print("-" * 80)
print("Outputted Plot:")

M_reduced_plot_test = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0, 0]])
word2Ind_plot_test = {'test1': 0, 'test2': 1, 'test3': 2, 'test4': 3, 'test5': 4}
words = ['test1', 'test2', 'test3', 'test4', 'test5']
plot_embeddings(M_reduced_plot_test, word2Ind_plot_test, words)

print("-" * 80)
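The next cell calls read_corpus(), which is not shown in this excerpt. A minimal sketch of what it presumably looks like, following the assignment's setup (Reuters "crude" category, lowercased tokens wrapped in START/END markers):

def read_corpus(category="crude"):
    """ Read files from the specified Reuters category.
        Params:
            category (string): category name
        Return:
            list of lists, with words from each of the processed files
    """
    files = reuters.fileids(category)
    return [[START_TOKEN] + [w.lower() for w in list(reuters.words(f))] + [END_TOKEN] for f in files]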
# -----------------------------
# Run This Cell to Produce Your Plot
# -----------------------------
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis]  # broadcasting

words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_normalized, word2Ind_co_occurrence, words)
The output:
Running Truncated SVD over 8185 words...
Done.
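Since every row of M_normalized now has unit length, the 2-D plot shows only the direction of each embedding (i.e. cosine similarity), not its magnitude. A quick hedged check:

# Every row of M_normalized should have (approximately) unit L2 norm.
print(np.allclose(np.linalg.norm(M_normalized, axis=1), 1.0))  # expected: True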
Part 2: Prediction-Based Word Vectors (15 points)
Here we load pretrained word2vec embeddings and again use truncated SVD, this time to reduce the 300-dimensional vectors to 2 dimensions.
def load_word2vec():
    """ Load Word2Vec Vectors
        Return:
            wv_from_bin: All 3 million embeddings, each length 300
    """
    import gensim.downloader as api
    wv_from_bin = api.load("word2vec-google-news-300")
    vocab = list(wv_from_bin.vocab.keys())
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
# -----------------------------------
# Run Cell to Load Word Vectors
# Note: This may take several minutes
# -----------------------------------
wv_from_bin = load_word2vec()
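One version caveat, as an aside: `wv_from_bin.vocab` is the gensim 3.x API. In gensim >= 4.0, `KeyedVectors.vocab` was removed; the rough equivalent of the vocab listing above would be:

# gensim >= 4.0 replacement for the vocab listing used above
vocab = list(wv_from_bin.key_to_index.keys())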
def get_matrix_of_vectors(wv_from_bin, required_words=['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']):
    """ Put the word2vec vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 3 million word2vec vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 300) containing the vectors
            word2Ind: dictionary mapping each word to its row number in M
    """
    import random
    words = list(wv_from_bin.vocab.keys())
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:10000]
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    curInd = 0
    for w in words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    for w in required_words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    M = np.stack(M)
    print("Done.")
    return M, word2Ind
# -----------------------------------------------------------------
# Run Cell to Reduce 300-Dimensional Word Embeddings to k Dimensions
# Note: This may take several minutes
# -----------------------------------------------------------------
M, word2Ind = get_matrix_of_vectors(wv_from_bin)
M_reduced = reduce_to_k_dim(M, k=2)
The output (note the 10010 rows: get_matrix_of_vectors appends the 10 required_words after the 10,000 sampled words):
Shuffling words ...
Putting 10000 words into word2Ind and matrix M ...
Done.
Running Truncated SVD over 10010 words...
Done.
Question 2.6: Guided Analysis of Bias in Word Vectors [written] (1 point)
This part analyzes bias in the word vectors.
# Run this cell
# Here `positive` indicates the list of words to be similar to and `negative` indicates the list of words to be
# most dissimilar from.
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'boss'], negative=['man']))
print()
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'boss'], negative=['woman']))
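Under the hood, most_similar ranks candidate words by cosine similarity to the combined query vector woman + boss - man, built from unit-normalized embeddings. A hedged sketch of that arithmetic, assuming the gensim 3.x `word_vec(..., use_norm=True)` API; `receptionist` is just an illustrative candidate word:

# Analogy arithmetic behind most_similar (a 3CosAdd-style sketch).
v = lambda w: wv_from_bin.word_vec(w, use_norm=True)  # unit-length embedding
query = v('woman') + v('boss') - v('man')
query = query / np.linalg.norm(query)
print(float(v('receptionist') @ query))  # cosine similarity of one candidate to the query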