Skip to content
This repository was archived by the owner on Oct 31, 2023. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions src/evaluation/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import numpy as np
import torch

import IPython as ipy

from ..utils import to_cuda, restore_segmentation, concat_batches
from ..model.memory import HashingMemory

Expand Down Expand Up @@ -395,6 +397,102 @@ def evaluate_mlm(self, scores, data_set, lang1, lang2):
eval_memory_usage(scores, '%s_%s_%s' % (data_set, l1l2, mem_name), mem_att, params.mem_size)


# get word similarity score
src_emb = params.src_emb
if model.asm_input is False:
tgt_emb = model.embeddings.weight.clone().to(0)
else:
tgt_emb = model.pred_layer.embed_asm_input(torch.cuda.LongTensor(range(model.pred_layer.n_words))).clone().to(0)

src_emb = src_emb.cpu().float()
tgt_emb = tgt_emb.cpu().float()
src_emb = src_emb / src_emb.norm(2, 1, keepdim=True).expand_as(src_emb)
tgt_emb = tgt_emb / tgt_emb.norm(2, 1, keepdim=True).expand_as(tgt_emb)
tgt_emb[tgt_emb != tgt_emb] = 0
n_eval = 10000

if params.lgs == 'en':
src_emb = src_emb[:n_eval]
scs = src_emb.mm(tgt_emb.transpose(0,1)).float()
scs = scs.argmax(dim=1).cpu().numpy()
acc = 100 * (scs == np.arange(n_eval)).sum() / n_eval
else:
# get ID-to-token mapping
src_id2word = {k:v for k, v in torch.load('/private/home/aconneau/projects/XLM/data/scaling-xlm/en_cc/50k/en.id2word.pth').items() if k < self.params.max_vocab}
tgt_id2word = {k:v for k, v in self.dico.id2word.items() if k < self.params.max_vocab}

# load aligned vectors and use their ID-to-token mapping
# src_emb = torch.tensor([list(map(float,line.strip().split()[1:])) for i,line in enumerate(open('/private/home/pmulc/MUSE/dumped/debug/uqtso4f8bw/vectors-en.txt','r')) if i > 0])
# tgt_emb = torch.tensor([list(map(float,line.strip().split()[1:])) for i,line in enumerate(open('/private/home/pmulc/MUSE/dumped/debug/uqtso4f8bw/vectors-fr.txt','r')) if i > 0])
# src_id2word = {i-1:line.split()[0] for i,line in enumerate(open('/private/home/pmulc/MUSE/dumped/debug/uqtso4f8bw/vectors-en.txt','r')) if i > 0}
# tgt_id2word = {i-1:line.split()[0] for i,line in enumerate(open('/private/home/pmulc/MUSE/dumped/debug/uqtso4f8bw/vectors-fr.txt','r')) if i > 0}

# if using outside vectors, normalize again
# src_emb = src_emb / src_emb.norm(2, 1, keepdim=True).expand_as(src_emb)
# tgt_emb = tgt_emb / tgt_emb.norm(2, 1, keepdim=True).expand_as(tgt_emb)

# load dictionary
tgt_lang = params.lgs
alignment_pairs = []
src_dict_words = set()
tgt_dict_words = set()
for line in open(f'/private/home/aconneau/projects/MUSE/data/crosslingual/dictionaries/en-{tgt_lang}.5000-6500.txt','r'):
wsrc, wtgt = line.strip().split()
src_dict_words.add(wsrc)
tgt_dict_words.add(wtgt)
alignment_pairs.append((wsrc,wtgt))

# filter ID-to-token mapping to get rid of non-word BPE tokens
src_words_indices = [(v[1:],k) for k,v in src_id2word.items() if v[0] == chr(9601) and v[1:] in src_dict_words]
# src_words_indices = [(v,k) for k,v in src_id2word.items() if v in src_dict_words]
src_words = [p[0] for p in src_words_indices]
src_indices = [p[1] for p in src_words_indices]
src_id2word = {i:v for i,v in enumerate(src_words)}

tgt_words_indices = [(v[1:],k) for k,v in tgt_id2word.items() if v[0] == chr(9601) and v[1:] in tgt_dict_words]
# tgt_words_indices = [(v,k) for k,v in tgt_id2word.items() if v in tgt_dict_words]
tgt_words = [p[0] for p in tgt_words_indices]
tgt_indices = [p[1] for p in tgt_words_indices]
tgt_id2word = {i:v for i,v in enumerate(tgt_words)}

# create word-to-ID mapping from filtered ID-to-token mapping
src_word2id = {v:k for k,v in src_id2word.items()}
tgt_word2id = {v:k for k,v in tgt_id2word.items()}

# make dictionary tensor
alignment_pairs = [p for p in alignment_pairs if p[0] in src_word2id and p[1] in tgt_word2id]
alignment_dict = torch.LongTensor(len(alignment_pairs), 2)
for i, pair in enumerate(alignment_pairs):
word1, word2 = pair
alignment_dict[i,0] = src_word2id[word1]
alignment_dict[i,1] = tgt_word2id[word2]

# select embeddings from filtered indices
src_emb = src_emb[src_indices]
tgt_emb = tgt_emb[tgt_indices]
src_emb = src_emb.cpu().float()
tgt_emb = tgt_emb.cpu().float()

# get dot product scores and find nearest neighbors
src_query = src_emb[alignment_dict[:, 0]]
scs = src_query.mm(tgt_emb.transpose(0,1)).float()
src_scs = scs.topk(k=1,dim=1)[1].cpu()

for k in [1]: #, 5, 10]:
top_k_matches = src_scs[:, :k]
_matching = (src_scs == alignment_dict[:, 1][:, None].expand_as(src_scs)).sum(1).cpu().numpy()
# allow for multiple possible translations
matching = {}
for i, src_id in enumerate(alignment_dict[:, 0].cpu().numpy()):
matching[src_id] = min(matching.get(src_id, 0) + _matching[i], 1)
# evaluate precision@k
precision_at_k = 100 * np.mean(list(matching.values()))

acc = precision_at_k

w2v_name = '%s_%s_w2v_p_at_1' % (data_set, l1l2)
scores[w2v_name] = acc

class SingleEvaluator(Evaluator):

def __init__(self, trainer, data, params):
Expand Down
67 changes: 64 additions & 3 deletions src/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,10 @@
import torch

from .pretrain import load_embeddings
from .transformer import DECODER_ONLY_PARAMS, TransformerModel # , TRANSFORMER_LAYER_PARAMS
from .transformer import DECODER_ONLY_PARAMS, TransformerModel, Embedding, Linear, N_MAX_POSITIONS # , TRANSFORMER_LAYER_PARAMS
from .memory import HashingMemory

import IPython as ipy

logger = getLogger()

Expand Down Expand Up @@ -44,7 +45,8 @@ def check_model_params(params):
assert 0 <= params.word_blank < 1

# model dimensions
assert params.emb_dim % params.n_heads == 0
model_dim = params.model_dim if params.model_dim != -1 else params.emb_dim
assert model_dim % params.n_heads == 0

# share input and output embeddings
assert params.share_inout_emb is False or params.asm is False
Expand Down Expand Up @@ -99,7 +101,7 @@ def set_pretrain_emb(model, dico, word2id, embeddings):
n_found += 1
model.embeddings.weight[i] = embeddings[idx].cuda()
model.pred_layer.proj.weight[i] = embeddings[idx].cuda()
logger.info("Pretrained %i/%i words (%.3f%%)."
logger.info("Loaded pretrained embs for %i/%i words (%.3f%%)."
% (n_found, len(dico), 100. * n_found / len(dico)))


Expand Down Expand Up @@ -131,8 +133,67 @@ def build_model(params, dico):
# logger.warning("Parameter %s not found. Ignoring ..." % k)
# reloaded[k] = model.state_dict()[k]

# HERE: Re-initialize 'position_embeddings.weight', 'embeddings.weight'

# Keep old parameters of embeddings for source language (English)
if 'embeddings.weight' in reloaded:
params.src_emb = reloaded['embeddings.weight'].clone().to(0)
elif 'pred_layer.proj.head.weight' in reloaded:
if hasattr(model, 'embeddings'):
print("Can't compute adaptive softmax embeddings with a non-asm-input model; just using random embeddings for src_emb")
params.src_emb = Embedding(params.n_words, params.emb_dim, padding_idx=params.pad_index).weight.data.half().cuda().to(0)
else:
model.load_state_dict(reloaded)
inputs = torch.LongTensor(range(len(model.dico)))
params.src_emb = model.pred_layer.embed_asm_input(inputs).clone().to(0)

# "Reload" parameters of embeddings for target language (others) by creating new ones
if params.reinit:
reloaded['position_embeddings.weight'] = Embedding(N_MAX_POSITIONS, params.emb_dim).weight.data.half().cuda().to(0)
if params.asm:
if 'pred_layer.proj.weight' in reloaded:
del reloaded['pred_layer.proj.weight']
del reloaded['pred_layer.proj.bias']
reloaded['pred_layer.proj.head.weight'] = model.pred_layer.proj.head.weight.data.half().cuda().to(0)
reloaded['pred_layer.proj.head.bias'] = model.pred_layer.proj.head.bias.data.half().cuda().to(0)
for i in range(len(model.pred_layer.proj.cutoffs)-1):
reloaded[f'pred_layer.proj.tail.{i}.0.weight'] = model.pred_layer.proj.tail[i][0].weight.data.half().cuda().to(0)
reloaded[f'pred_layer.proj.tail.{i}.1.weight'] = model.pred_layer.proj.tail[i][1].weight.data.half().cuda().to(0)
else:
for key in reloaded:
if 'pred_layer.proj.head' in key or 'pred_layer.proj.tail' in key:
del reloaded[key]
if params.asm_input:
if 'embeddings.weight' in reloaded:
del reloaded['embeddings.weight']
else:
reloaded['embeddings.weight'] = Embedding(params.n_words, params.emb_dim, padding_idx=params.pad_index).weight.data.half().cuda().to(0)

if 'post_embed_proj.weight' in reloaded:
pep = Linear(params.emb_dim, params.model_dim)
reloaded['post_embed_proj.weight'] = pep.weight.data.half().cuda().to(0)
reloaded['post_embed_proj.bias'] = pep.bias.data.half().cuda().to(0)
if 'post_embed_pos_proj.weight' in reloaded:
pepp = Linear(params.emb_dim, params.model_dim)
reloaded['post_embed_pos_proj.weight'] = pepp.weight.data.half().cuda().to(0)
reloaded['post_embed_pos_proj.bias'] = pepp.bias.data.half().cuda().to(0)
if 'embeddings.weight' in reloaded and params.share_inout_emb:
# tie input/output embeddings
reloaded['pred_layer.proj.weight'] = reloaded['embeddings.weight'].clone().to(0)
reloaded['pred_layer.proj.bias'] = Linear(params.emb_dim, params.n_words, bias=True).bias.data.half().cuda().to(0)
model.load_state_dict(reloaded)

if params.reload_emb != '':
# load embeddings from file
word2id, embeddings = load_embeddings(params.reload_emb, params)
# rescale embeddings to match pretrained model.
# this is only approximate as some words don't have preloaded embs, but the initialized values should be close to the desired scale anyway
loaded_mean = embeddings.abs().mean()
pretrained_mean = params.src_emb.abs().mean()
embeddings = embeddings/(loaded_mean/pretrained_mean)
# update model
set_pretrain_emb(model, dico, word2id, embeddings)

logger.info("Model: {}".format(model))
logger.info("Number of parameters (model): %i" % sum([p.numel() for p in model.parameters() if p.requires_grad]))

Expand Down
59 changes: 50 additions & 9 deletions src/model/transformer.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,13 +110,13 @@ def __init__(self, params):
self.asm = params.asm
self.n_words = params.n_words
self.pad_index = params.pad_index
dim = params.emb_dim
self.dim = params.emb_dim

if params.asm is False:
self.proj = Linear(dim, params.n_words, bias=True)
self.proj = Linear(self.dim, params.n_words, bias=True)
else:
self.proj = nn.AdaptiveLogSoftmaxWithLoss(
in_features=dim,
in_features=self.dim,
n_classes=params.n_words,
cutoffs=params.asm_cutoffs,
div_value=params.asm_div_value,
Expand Down Expand Up @@ -145,6 +145,22 @@ def get_scores(self, x):
assert x.dim() == 2
return self.proj.log_prob(x) if self.asm else self.proj(x)

def embed_asm_input(self, input):
    """
    Look up input embeddings for token IDs using the adaptive-softmax
    projection's own weight matrices.

    IDs below the first cutoff (the "head" cluster) take rows of the head
    weight matrix directly as their embeddings. IDs in tail cluster ``i``
    are embedded with that cluster's smaller embedding table
    (``proj.tail[i][1].weight``) and then projected back up to ``self.dim``
    via the transpose of the cluster's down-projection
    (``proj.tail[i][0].weight``).

    Returns a tensor of shape ``input.shape + (self.dim,)``, on the same
    device as ``input`` and with the dtype of the head weights.
    """
    out = torch.zeros(input.shape + (self.dim,), dtype=self.proj.head.weight.dtype)
    if input.device.type == 'cuda':
        out = out.cuda(device=input.device)
    cutoffs = self.proj.cutoffs  # last entry is n_words, so every ID falls in some cluster
    for i, high in enumerate(cutoffs):
        in_cluster = input.lt(high)
        if i == 0:
            # head cluster: head weight rows double as the embedding table
            out[in_cluster] = nn.functional.embedding(input[in_cluster], self.proj.head.weight)
        else:
            in_cluster = in_cluster & input.ge(cutoffs[i - 1])
            rel_ids = input[in_cluster] - cutoffs[i - 1]
            small_embs = nn.functional.embedding(rel_ids, self.proj.tail[i - 1][1].weight)
            out[in_cluster] = nn.functional.linear(small_embs, self.proj.tail[i - 1][0].weight.t())
    return out


class MultiHeadAttention(nn.Module):

Expand Down Expand Up @@ -264,7 +280,8 @@ def __init__(self, params, dico, is_encoder, with_output):
assert len(self.id2lang) == len(self.lang2id) == self.n_langs

# model parameters
self.dim = params.emb_dim # 512 by default
self.dim = params.model_dim if params.model_dim != -1 else params.emb_dim
self.emb_dim = params.emb_dim # 512 by default
self.hidden_dim = self.dim * 4 # 2048 by default
self.n_heads = params.n_heads # 8 by default
self.n_layers = params.n_layers
Expand All @@ -273,12 +290,23 @@ def __init__(self, params, dico, is_encoder, with_output):
assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'

# embeddings
self.position_embeddings = Embedding(N_MAX_POSITIONS, self.dim)
if params.asm and params.asm_input:
self.asm_input = True
else:
self.asm_input = False
self.embeddings = Embedding(self.n_words, self.emb_dim, padding_idx=self.pad_index)
self.position_embeddings = Embedding(N_MAX_POSITIONS, self.emb_dim)
if params.sinusoidal_embeddings:
create_sinusoidal_embeddings(N_MAX_POSITIONS, self.dim, out=self.position_embeddings.weight)
create_sinusoidal_embeddings(N_MAX_POSITIONS, self.emb_dim, out=self.position_embeddings.weight)
if params.n_langs > 1 and self.use_lang_emb:
self.lang_embeddings = Embedding(self.n_langs, self.dim)
self.embeddings = Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
if self.dim != self.emb_dim:
if params.n_langs > 1 and self.use_lang_emb:
raise NotImplementedError("for multilingual projected training, implement language-specific projections")
self.post_embed_proj = Linear(self.emb_dim, self.dim, bias=True)
self.post_embed_pos_proj = Linear(self.emb_dim, self.dim, bias=True)
else:
self.post_embed_proj = None
self.layer_norm_emb = nn.LayerNorm(self.dim, eps=1e-12)

# transformer layers
Expand Down Expand Up @@ -380,8 +408,18 @@ def fwd(self, x, lengths, causal, src_enc=None, src_len=None, positions=None, la
attn_mask = attn_mask[:, -_slen:]

# embeddings
tensor = self.embeddings(x)
tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
if self.asm_input:
tensor = self.pred_layer.embed_asm_input(x)
else:
tensor = self.embeddings(x)

# post-embedding projections, if applicable
if self.post_embed_proj is not None:
tensor = self.post_embed_proj(tensor)
tensor += self.post_embed_pos_proj(self.position_embeddings(positions)).expand_as(tensor)
else:
tensor += self.position_embeddings(positions).expand_as(tensor)

if langs is not None and self.use_lang_emb:
tensor = tensor + self.lang_embeddings(langs)
tensor = self.layer_norm_emb(tensor)
Expand Down Expand Up @@ -436,6 +474,9 @@ def predict(self, tensor, pred_mask, y, get_scores):
`get_scores` is a boolean specifying whether we need to return scores
"""
masked_tensor = tensor[pred_mask.unsqueeze(-1).expand_as(tensor)].view(-1, self.dim)
if self.post_embed_proj:
# project down to small embedding dimension again
masked_tensor = nn.functional.linear(masked_tensor, self.post_embed_proj.weight.t())
scores, loss = self.pred_layer(masked_tensor, y, get_scores)
return scores, loss

Expand Down
Loading