Semantic Representations
A tutorial on Semantic Representations in NLP and their use in Document Ranking & Classification.
Introduction
Word embeddings represent words as multi-dimensional vectors: a word w_i in a vocabulary V is mapped to a vector of n dimensions. These vectors are learned by unsupervised training on a large text corpus, so that semantically similar words end up with similar vectors.
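As a toy illustration (with made-up 4-dimensional vectors rather than real embeddings), this similarity is usually measured as the cosine of the angle between two word vectors:
import numpy as np
# Hypothetical 4-dimensional "embeddings"; real embeddings have hundreds of dimensions
cat = np.array([0.8, 0.1, 0.3, 0.0])
dog = np.array([0.7, 0.2, 0.4, 0.1])
# Cosine similarity: dot product divided by the product of the vector norms
print(np.dot(cat, dog) / (np.linalg.norm(cat) * np.linalg.norm(dog)))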
import sys
!mkdir vectors
# Download the different files using these commands. This may take a while
!cd vectors && curl -O http://magnitude.plasticity.ai/word2vec/light/GoogleNews-vectors-negative300.magnitude
# !cd vectors && curl -O http://magnitude.plasticity.ai/glove/light/glove.6B.50d.magnitude
!cd vectors && curl -O http://magnitude.plasticity.ai/elmo/light/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude
!ls
!cd vectors && ls
Install the required libraries. Again, this may take a while.
!pip3 install torch numpy scikit-learn pandas transformers==3.1.0 seaborn matplotlib sentence_transformers
import torch
torch.__version__
Since pymagnitude cannot be installed properly from PyPI, we'll use a different method for it: install Magnitude on Google Colab using the script below.
! echo "Installing Magnitude.... (please wait, can take a while)"
! (curl https://raw.githubusercontent.com/plasticityai/magnitude/master/install-colab.sh | /bin/bash 1>/dev/null 2>/dev/null)
! echo "Done installing Magnitude."
import pymagnitude as pym
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
def similarity_between_docs(doc1, doc2, is_1d=False):
    # If the inputs are already single vectors, compare them directly;
    # otherwise mean-pool the word vectors into one document vector first
    if is_1d:
        v1 = np.reshape(doc1, (1, -1))
        v2 = np.reshape(doc2, (1, -1))
    else:
        d1 = np.mean(doc1, axis=0)
        d2 = np.mean(doc2, axis=0)
        v1 = np.reshape(d1, (1, -1))
        v2 = np.reshape(d2, (1, -1))
    return cosine_similarity(v1, v2)[0][0]
def plot_1d_heatmap(vec, name):
    # Plot a single embedding vector as a 1 x dim heatmap
    v = vec.reshape(1, -1)
    plt.figure(figsize=(20, 2))
    sns.heatmap(v, cmap="YlGnBu").set_title(name)
    plt.rcParams.update({"font.size": 22})
    plt.show()
# glove_vectors = pym.Magnitude("./vectors/glove.6B.50d.magnitude")
w2v_vectors = pym.Magnitude("./vectors/GoogleNews-vectors-negative300.magnitude")
To use other vectors, download the pre-trained vectors from the pymagnitude repo and put them in the ./vectors folder.
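For example, if you downloaded the GloVe file from the commented-out command above, it can be loaded and inspected in the same way:
# Assumes glove.6B.50d.magnitude was downloaded into ./vectors (see the commented-out curl command above)
# glove_vectors = pym.Magnitude("./vectors/glove.6B.50d.magnitude")
# print("Total words: {}\tDimension: {}".format(len(glove_vectors), glove_vectors.dim))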
print("Vector Name: {}\nTotal words: {}\nDimension of each word: {}".
format("Word2Vec", len(w2v_vectors), w2v_vectors.dim))
# Iterate over the (word, vector) pairs and print the 1000th entry as an example
for i, (key, vec) in enumerate(w2v_vectors):
    if i == 1000:
        print("Index: {}\nWord: {}\nVector Size: {}\nVector: {}".format(i, key, vec.shape, vec))
        break
print(w2v_vectors.query("dog"))
# Get the vector using the index
print(w2v_vectors[1000])
doc_vecs = w2v_vectors.query(["I", "read", "a", "book"])
doc_vecs.shape
mul_doc_vecs = w2v_vectors.query([["I", "read", "a", "book"], ["I", "read", "a", "sports", "magazine"]])
mul_doc_vecs.shape
print("Similarity between \"Apple\" and \"Mango\": {}".
format(w2v_vectors.similarity("apple", "mango")))
print("Similarity between \"Apple\" and [\"Mango\", \"Orange\"]: {}".
format(w2v_vectors.similarity("apple", ["mango", "orange"])))
print("Most similar to \"Cat\" from [\"Dog\", \"Television\", \"Laptop\"]: {}".
format(w2v_vectors.most_similar_to_given("cat", ["dog", "television", "laptop"])))
doc1 = w2v_vectors.query(["I", "read", "a", "book"])
doc2 = w2v_vectors.query(["I", "read", "a", "sports", "magazine"])
print("Similarity between\n\"I read a book\" and \"I read a sports magazine\": {}".
format(similarity_between_docs(doc1, doc2, is_1d=False)))
plot_1d_heatmap(w2v_vectors.query("king"), "King")
plot_1d_heatmap(w2v_vectors.query("man"), "Man")
plot_1d_heatmap(w2v_vectors.query("woman"), "Woman")
plot_1d_heatmap(w2v_vectors.query("queen"), "Queen")
tmp = w2v_vectors.query("king") - w2v_vectors.query("man") + w2v_vectors.query("woman")
plot_1d_heatmap(tmp, "King - Man + Woman")
print("Similarity between\n\"King - Man + Woman\" and \"Queen\": {}".
format(similarity_between_docs(tmp, w2v_vectors.query("queen"), is_1d=True)))
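Magnitude can also run this analogy query directly through its most_similar interface (a quick sanity check; the exact neighbours depend on the vectors used):
# "king" - "man" + "woman" should rank "queen" near the top
print(w2v_vectors.most_similar(positive=["king", "woman"], negative=["man"], topn=5))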
"""
Try plotting the heatmap for four different words, out of which three should share the
same property and one should be different. For example: "girl", "boy", "man", "water"
"""
# Your code here
"""
Calculate the similarity between two words with a similar sense and between two words with no
similarity. For example, the similarity between "cat" & "dog" versus the similarity between
"apple" and "lion"
"""
# Your code here
"""
Print the similarity score of "paris" with each of the following:
"delhi", "vienna", "london", "france", "laptop"
"""
# Your code here
elmo_vecs = pym.Magnitude('./vectors/elmo_2x1024_128_2048cnn_1xhighway_weights.magnitude')
ELMo generates the embedding of a word based on its context, so we need to provide a full sentence in order to get the embedding of a particular word.
sen1 = elmo_vecs.query(["yes", "they", "are", "right"])
sen2 = elmo_vecs.query(["go", "to", "your", "right"])
right1 = sen1[-1]
right2 = sen2[-1]
print("right from sentence 1: {}\tright from sentence 2: {}".format(right1.shape, right2.shape))
plot_1d_heatmap(right1, name="ELMo vec for right in \"yes they are right\"")
plot_1d_heatmap(right2, name="ELMo vec for right in \"go to your right\"")
print("Simialrity between \"right\" from sentence 1 & 2:\t{}".
format(similarity_between_docs(right1, right2, is_1d=True)))
print("Simialrity between \"right\" from sentence 1 only:\t{}".
format(similarity_between_docs(right1, right1, is_1d=True)))
print("Simialrity between \"right\" from sentence 2 only:\t{}".
format(similarity_between_docs(right2, right2, is_1d=True)))
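For contrast, a static embedding such as Word2Vec assigns "right" a single vector regardless of context, so the two senses cannot be told apart:
# Word2Vec returns the same vector for "right" in both sentences, so the similarity is exactly 1.0
print(similarity_between_docs(w2v_vectors.query("right"), w2v_vectors.query("right"), is_1d=True))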
Google's BERT
Since pymagnitude doesn't support BERT yet, we'll use Hugging Face's transformers library for this.
import torch
import transformers
# this may take a while for first time
model_class, tokenizer_class, pretrained_weights = (transformers.BertModel,
transformers.BertTokenizer, 'bert-base-uncased')
# Load pretrained model/tokenizer
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
tokenized1 = tokenizer.encode("yes they are right", add_special_tokens=False)
tokenized2 = tokenizer.encode("go to your right", add_special_tokens=False)
print(tokenized1, tokenized2)
# you can also get the full sentence using the token_ids
print(tokenizer.decode(tokenized1))
print(tokenizer.decode(tokenized2))
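To see the actual WordPiece tokens (and confirm that "right" is the last token of both sentences), convert the ids back to token strings:
# Map token ids back to their token strings
print(tokenizer.convert_ids_to_tokens(tokenized1))
print(tokenizer.convert_ids_to_tokens(tokenized2))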
# Both token lists have the same length here, so they can be stacked into one tensor directly
input_ids = torch.tensor([tokenized1, tokenized2])
model.eval()
with torch.no_grad():
    outputs = model(input_ids)
    last_hidden_states = outputs[0]  # final-layer hidden states, shape (batch, seq_len, hidden)
right1_bert = (last_hidden_states[0][-1]).numpy()
right2_bert = (last_hidden_states[1][-1]).numpy()
print(right1_bert.shape, right2_bert.shape)
plot_1d_heatmap(right1_bert, name="BERT vec for right in \"yes they are right\"")
plot_1d_heatmap(right2_bert, name="BERT vec for right in \"go to your right\"")
print("Simialrity between \"right\" from sentence 1 & 2 using BERT:\t{}".
format(similarity_between_docs(right1_bert, right2_bert, is_1d=True)))
print("Simialrity between \"right\" from sentence 1 only using BERT:\t{}".
format(similarity_between_docs(right1_bert, right1_bert, is_1d=True)))
print("Simialrity between \"right\" from sentence 2 only using BERT:\t{}".
format(similarity_between_docs(right2_bert, right2_bert, is_1d=True)))
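Stacking the two token-id lists into one tensor above only works because both sentences happen to have the same number of tokens. For sentences of different lengths, the usual approach is to let the tokenizer pad the batch and pass the attention mask to the model; a minimal sketch with the same tokenizer and model (the second sentence here is just an example):
# Pad a batch of different-length sentences and tell the model which positions are padding
batch = tokenizer(["yes they are right", "go to your right and then turn left"],
                  padding=True, return_tensors="pt", add_special_tokens=False)
with torch.no_grad():
    padded_out = model(batch["input_ids"], attention_mask=batch["attention_mask"])
print(padded_out[0].shape)  # (batch size, longest sequence length, hidden size)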
!mkdir data
!cd data && curl -O https://raw.githubusercontent.com/ashishu007/Word-Embeddings/master/data/abstracts.csv
!cd data && curl -O https://raw.githubusercontent.com/ashishu007/Word-Embeddings/master/data/train.tsv
dfa = pd.read_csv('./data/abstracts.csv')
print(dfa.shape)
dfa = dfa[:50]
dfa.head(5)
import nltk
nltk.download('punkt')
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
torch.__version__
def gen_w2v_embs(row):
    # Tokenize the text, keep alphabetic tokens, stem them and drop a small stop-word list
    tokens = nltk.word_tokenize(row)
    token_words = [w for w in tokens if w.isalpha()]
    stemming = PorterStemmer()
    tokens_stemmed = [stemming.stem(word) for word in token_words]
    # stops = set(stopwords.words("english"))  # the full NLTK list needs nltk.download('stopwords')
    stops = ["a", "an", "the"]
    meaningful_words = [w for w in tokens_stemmed if w not in stops]
    # Average the Word2Vec vectors of the remaining tokens into a single document vector
    vecs = []
    for w in meaningful_words:
        w_vec = w2v_vectors.query(w)
        vecs.append(w_vec)
    vec_arr = np.array(vecs)
    vec_final = np.mean(vec_arr, axis=0, dtype="float32")
    return vec_final
# Here we use a different library, `sentence_transformers`, because it is
# easier to use than `transformers` for sentence embeddings
def gen_bert_embs(col):
    # Encode a string (or a list/Series of strings) into sentence embedding(s)
    bert_model = SentenceTransformer('distilbert-base-nli-mean-tokens')
    bert_embs = bert_model.encode(col)
    return bert_embs
w2v_abs = dfa["content"].apply(gen_w2v_embs)
w2v_abs = (torch.tensor(w2v_abs)).numpy()
elmo_abs = dfa["content"].apply((lambda x: elmo_vecs.query(x)))
elmo_abs = (torch.tensor(elmo_abs)).numpy()
bert_abs = gen_bert_embs(dfa["content"])
w2v_abs.shape, elmo_abs.shape, bert_abs.shape
type(w2v_abs), type(elmo_abs), type(bert_abs)
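If the torch.tensor conversion above complains about the object-dtype Series (this varies across torch and pandas versions), stacking the list of per-document vectors with NumPy is an equivalent way to get the (n_docs, dim) arrays:
# Alternative conversion, equivalent to torch.tensor(...).numpy() on the Series of vectors
# w2v_abs = np.stack(dfa["content"].apply(gen_w2v_embs).tolist())
# elmo_abs = np.stack(dfa["content"].apply(lambda x: elmo_vecs.query(x)).tolist())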
def gen_query_emb(q, emb="w2v"):
    # Embed a query string with the chosen embedding model ("w2v", "elmo" or "bert")
    if emb == "w2v":
        query_emb = gen_w2v_embs(q)
    elif emb == "elmo":
        query_emb = elmo_vecs.query(q)
    elif emb == "bert":
        query_bert = gen_bert_embs(q)
        query_emb = query_bert.reshape(-1)
    return query_emb
q1 = gen_query_emb("documents that discuss learning methods", emb="elmo")
q2 = gen_query_emb("documents that discuss learning methods", emb="bert")
q3 = gen_query_emb("documents that discuss learning methods", emb="w2v")
q1.shape, q2.shape, q3.shape
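Note that a query can only be ranked against document embeddings of the same type, since the dimensionalities must match (e.g. the BERT query against the BERT abstracts):
# The query vector and the document vectors it is compared against must share a dimension
assert q2.shape[0] == bert_abs.shape[1]
assert q3.shape[0] == w2v_abs.shape[1]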
def get_doc_similarity(q, docs):
    # Score every document against the query and sort by similarity, highest first
    sims = {}
    for i, doc in enumerate(docs):
        sim_score = similarity_between_docs(q, doc, is_1d=True)
        sims[i] = sim_score
    sims_sorted = {k: v for k, v in sorted(sims.items(), key=lambda item: item[1], reverse=True)}
    return sims_sorted
s = get_doc_similarity(q2, bert_abs)
# s
ss = list(s.keys())[:10]
ss
dfa["content"][17]
"""
Try ranking the same documents using another natural language query and another algorithm (embedding)
"""
# Your code here
"""
Print the top 3 documents ranked by each query and algorithm, and compare their results
"""
# Your code here
"""
Plot the heatmap for a sentence using BERT and ELMo
"""
# Your code here
"""
Plot the heatmap for the same sentence using Word2Vec
"""
# Your code here
df = pd.read_csv('./data/train.tsv', delimiter='\t', names=["text", "label"])
print(df.shape)
df = df[:200]
df.head(5)
w2vs = df["text"].apply(gen_w2v_embs)
w2v_embs = (torch.tensor(w2vs)).numpy()
w2v_embs.shape
elmos = df["text"].apply((lambda x: elmo_vecs.query(x)))
elmo_embs = (torch.tensor(elmos)).numpy()
elmo_embs.shape
bert_embs = gen_bert_embs(df["text"])
bert_embs.shape
labels = df["label"]
from sklearn.model_selection import train_test_split
X_train_w2v, X_test_w2v, y_train_w2v, y_test_w2v = train_test_split(w2v_embs, labels,
test_size=0.33,
random_state=42, stratify=labels)
X_train_elmo, X_test_elmo, y_train_elmo, y_test_elmo = train_test_split(elmo_embs, labels,
test_size=0.33,
random_state=42, stratify=labels)
X_train_bert, X_test_bert, y_train_bert, y_test_bert = train_test_split(bert_embs, labels,
test_size=0.33,
random_state=42, stratify=labels)
from sklearn.linear_model import LogisticRegression
lr_clf_w2v = LogisticRegression()
lr_clf_w2v.fit(X_train_w2v, y_train_w2v)
lr_clf_elmo = LogisticRegression()
lr_clf_elmo.fit(X_train_elmo, y_train_elmo)
lr_clf_bert = LogisticRegression()
lr_clf_bert.fit(X_train_bert, y_train_bert)
y_pred_w2v = lr_clf_w2v.predict(X_test_w2v)
y_pred_elmo = lr_clf_elmo.predict(X_test_elmo)
y_pred_bert = lr_clf_bert.predict(X_test_bert)
from sklearn.metrics import accuracy_score, f1_score
print("Word2Vec\tAccuracy: {}\tMacro F1: {}".format(accuracy_score(y_pred_w2v, y_test_w2v),
f1_score(y_pred_w2v, y_test_w2v, average="macro")))
print("ELMo\tAccuracy: {}\tMacro F1: {}".format(accuracy_score(y_pred_elmo, y_test_elmo),
f1_score(y_pred_elmo, y_test_elmo, average="macro")))
print("BERT\tAccuracy: {}\tMacro F1: {}".format(accuracy_score(y_pred_bert, y_test_bert),
f1_score(y_pred_bert, y_test_bert, average="macro")))
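For a per-class breakdown of the classifiers (not just the aggregate scores above), scikit-learn's classification_report can be printed as well, for example for the BERT-based model:
from sklearn.metrics import classification_report
# Per-class precision, recall and F1 for the BERT-based classifier
print(classification_report(y_test_bert, y_pred_bert))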