# From Sparse to Dense: The Evolution of Word Embeddings, Their Implementation, and the Frontier of Sparse Mixture-of-Experts Models

## Introduction: The Nature and Evolution of Word Embeddings

In natural language processing, word embeddings have become the bridge between symbolic language and numerical computation. Although traditional methods such as Word2Vec and GloVe are now ubiquitous, embedding techniques continue to evolve. This article works through the core implementations of word embeddings and introduces frontier approaches such as the sparse Mixture-of-Experts (MoE) model, offering developers a technical perspective beyond the traditional paradigm.

## 1. Mathematical Foundations and Classic Implementations of Word Embeddings

### 1.1 Co-occurrence Matrices and Singular Value Decomposition (SVD)

Word embeddings trace back to the distributional hypothesis: "words that occur in similar contexts have similar meanings." The earliest implementations applied singular value decomposition to a word-word co-occurrence matrix.

```python
import re
from collections import defaultdict

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import svds


class SVDWordEmbeddings:
    """SVD-based word embeddings."""

    def __init__(self, window_size=5, min_count=5, embedding_dim=100):
        self.window_size = window_size
        self.min_count = min_count
        self.embedding_dim = embedding_dim
        self.word2id = {}
        self.id2word = {}
        self.cooccurrence = None
        self.embeddings = None

    def build_vocabulary(self, texts):
        """Build the vocabulary."""
        word_counts = defaultdict(int)
        # Count word frequencies
        for text in texts:
            for token in self._tokenize(text):
                word_counts[token] += 1
        # Filter out low-frequency words
        self.vocab = [word for word, count in word_counts.items()
                      if count >= self.min_count]
        # Build the index mappings
        self.word2id = {word: idx for idx, word in enumerate(self.vocab)}
        self.id2word = {idx: word for idx, word in enumerate(self.vocab)}
        return self.vocab

    def build_cooccurrence_matrix(self, texts):
        """Build the co-occurrence matrix."""
        vocab_size = len(self.vocab)
        # A sparse matrix keeps memory usage manageable
        self.cooccurrence = sparse.lil_matrix((vocab_size, vocab_size),
                                              dtype=np.float32)
        for text in texts:
            tokens = self._tokenize(text)
            token_ids = [self.word2id[t] for t in tokens if t in self.word2id]
            # Sliding-window co-occurrence counts
            for i, center_id in enumerate(token_ids):
                start = max(0, i - self.window_size)
                end = min(len(token_ids), i + self.window_size + 1)
                for j in range(start, end):
                    if j == i:
                        continue
                    context_id = token_ids[j]
                    # Weight context words by inverse distance
                    distance = abs(i - j)
                    self.cooccurrence[center_id, context_id] += 1.0 / distance
        return self.cooccurrence

    def compute_embeddings(self):
        """Compute word embeddings via truncated SVD."""
        if self.cooccurrence is None:
            raise ValueError("Build the co-occurrence matrix first")
        # Apply the PPMI (positive pointwise mutual information) transform
        ppmi_matrix = self._compute_ppmi()
        # svds requires k < min(matrix dims), so cap it for small vocabularies
        k = min(self.embedding_dim, min(ppmi_matrix.shape) - 1)
        U, Sigma, VT = svds(ppmi_matrix, k=k)
        # Scale the left singular vectors to obtain the word vectors
        self.embeddings = U * np.sqrt(Sigma)
        return self.embeddings

    def _compute_ppmi(self):
        """Compute the PPMI matrix."""
        coo_matrix = self.cooccurrence.tocoo()
        total_sum = coo_matrix.sum()
        # Marginal counts
        row_sums = np.array(coo_matrix.sum(axis=1)).flatten()
        col_sums = np.array(coo_matrix.sum(axis=0)).flatten()
        # PMI for each non-zero cell, clipped at zero (PPMI)
        rows, cols, values = [], [], []
        for i, j, v in zip(coo_matrix.row, coo_matrix.col, coo_matrix.data):
            pmi = np.log((v * total_sum) /
                         (row_sums[i] * col_sums[j] + 1e-8))
            ppmi = max(0, pmi)  # keep positive values only
            if ppmi > 0:
                rows.append(i)
                cols.append(j)
                values.append(ppmi)
        return sparse.csr_matrix((values, (rows, cols)),
                                 shape=coo_matrix.shape)

    def _tokenize(self, text):
        """Very simple regex word tokenizer."""
        return re.findall(r"\b\w+\b", text.lower())

    def get_vector(self, word):
        """Look up a word vector."""
        if word not in self.word2id:
            return None
        return self.embeddings[self.word2id[word]]


# Usage example
if __name__ == "__main__":
    # Sample texts
    texts = [
        "natural language processing is an important branch of artificial intelligence",
        "deep learning is widely applied in natural language processing",
        "word embeddings are a vectorized representation of words",
        "machine learning algorithms require numerical input",
    ]
    svd_emb = SVDWordEmbeddings(window_size=3, min_count=1, embedding_dim=50)
    svd_emb.build_vocabulary(texts)
    svd_emb.build_cooccurrence_matrix(texts)
    embeddings = svd_emb.compute_embeddings()
    print(f"Vocabulary size: {len(svd_emb.vocab)}")
    print(f"Embedding shape: {embeddings.shape}")
    print(f"Vector for 'natural': {svd_emb.get_vector('natural')[:5]}")  # first 5 dims
```
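With the model trained, the distributional hypothesis can be checked directly: words that share contexts should land close together in the embedding space. Below is a minimal sketch of such a check — `nearest_neighbors` is a hypothetical helper, not part of the class above — ranking vocabulary words by cosine similarity against a query, assuming the `svd_emb` instance from the example above:

```python
import numpy as np


def nearest_neighbors(svd_emb, word, top_k=3):
    """Return the top_k vocabulary words closest to `word` by cosine similarity."""
    query = svd_emb.get_vector(word)
    if query is None:
        return []
    # Normalize every embedding once, then score with a single matrix-vector product
    norms = np.linalg.norm(svd_emb.embeddings, axis=1, keepdims=True) + 1e-8
    scores = (svd_emb.embeddings / norms) @ (query / (np.linalg.norm(query) + 1e-8))
    # Sort by descending similarity, skipping the query word itself
    order = [i for i in np.argsort(-scores) if svd_emb.id2word[i] != word]
    return [(svd_emb.id2word[i], float(scores[i])) for i in order[:top_k]]


# e.g. after running the example above:
# print(nearest_neighbors(svd_emb, "language"))
```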
### 1.2 GloVe: Optimizing Global Vector Representations

GloVe (Global Vectors for Word Representation) combines global co-occurrence statistics with the benefits of local context windows by minimizing a weighted least-squares loss, $J = \sum_{i,j} f(X_{ij})\left(w_i^\top \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij}\right)^2$, where the weighting function $f$ caps the influence of very frequent co-occurrences.

```python
import numpy as np
from scipy import sparse
from tqdm import tqdm


class GloVe:
    """A complete GloVe word-embedding implementation."""

    def __init__(self, embedding_dim=100, x_max=100, alpha=0.75,
                 learning_rate=0.05, epochs=50):
        self.embedding_dim = embedding_dim
        self.x_max = x_max  # weighting-function cutoff
        self.alpha = alpha  # weighting-function exponent
        self.lr = learning_rate
        self.epochs = epochs

    def fit(self, cooccurrence, vocab_size):
        """Train the GloVe model."""
        # Initialize the parameters
        self.W = np.random.randn(vocab_size, self.embedding_dim) * 0.01
        self.W_tilde = np.random.randn(vocab_size, self.embedding_dim) * 0.01
        self.b = np.zeros(vocab_size)
        self.b_tilde = np.zeros(vocab_size)

        # Convert the sparse matrix to coordinate format
        coo = cooccurrence.tocoo()
        indices = list(zip(coo.row, coo.col, coo.data))

        # Training loop
        losses = []
        for epoch in range(self.epochs):
            np.random.shuffle(indices)
            epoch_loss = 0
            for i, j, x_ij in tqdm(indices, desc=f"Epoch {epoch + 1}"):
                # Weighting function f(x)
                if x_ij < self.x_max:
                    weight = (x_ij / self.x_max) ** self.alpha
                else:
                    weight = 1.0

                # Current prediction
                prediction = (np.dot(self.W[i], self.W_tilde[j])
                              + self.b[i] + self.b_tilde[j])

                # Loss and gradient
                loss = weight * (prediction - np.log(x_ij)) ** 2
                grad = 2 * weight * (prediction - np.log(x_ij))

                # Compute both gradients before touching either parameter,
                # then apply the SGD updates
                grad_W = grad * self.W_tilde[j]
                grad_W_tilde = grad * self.W[i]
                self.W[i] -= self.lr * grad_W
                self.W_tilde[j] -= self.lr * grad_W_tilde
                self.b[i] -= self.lr * grad
                self.b_tilde[j] -= self.lr * grad

                epoch_loss += loss

            losses.append(epoch_loss / len(indices))
            print(f"Epoch {epoch + 1}, Loss: {losses[-1]:.4f}")

        # Average the two vector sets to get the final embeddings
        self.embeddings = (self.W + self.W_tilde) / 2
        return self.embeddings, losses

    def weighting_function(self, x):
        """GloVe weighting function f(x)."""
        if x < self.x_max:
            return (x / self.x_max) ** self.alpha
        return 1.0


# GloVe training example
def train_glove_example():
    # Build a simulated co-occurrence matrix
    vocab_size = 1000
    embedding_dim = 50

    # Random co-occurrence counts (in practice, build these from real data)
    nnz = vocab_size * 10  # number of non-zero entries
    rows = np.random.randint(0, vocab_size, nnz)
    cols = np.random.randint(0, vocab_size, nnz)
    data = np.random.randint(1, 100, nnz)
    cooccurrence = sparse.csr_matrix((data, (rows, cols)),
                                     shape=(vocab_size, vocab_size))

    # Train GloVe
    glove = GloVe(embedding_dim=embedding_dim, epochs=10)
    embeddings, losses = glove.fit(cooccurrence, vocab_size)

    print(f"Final embedding shape: {embeddings.shape}")
    print(f"Training loss curve: {losses}")


if __name__ == "__main__":
    train_glove_example()
```

## 2. Sparse Mixture-of-Experts: A New Paradigm for Word Embeddings

### 2.1 Basic Principles and Advantages of MoE

A sparse Mixture-of-Experts (MoE) model decomposes a large neural network into multiple sub-networks ("experts") and uses a gating network to dynamically select the experts relevant to each input. This preserves model capacity while drastically reducing the computation performed per token.
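Before the full embedding architecture in the next subsection, the routing idea is worth seeing in isolation. Here is a minimal sketch of top-k gating — `topk_route`, `gate`, and `experts` are illustrative names, not part of the architecture below — in which every token is scored against all experts but only its two best experts actually run:

```python
import torch
import torch.nn as nn


def topk_route(x, gate, experts, k=2):
    """Dispatch each row of x to its top-k experts and mix the results.

    x:       [num_tokens, d] input vectors
    gate:    nn.Linear(d, num_experts) scoring module
    experts: list of modules mapping [*, d] -> [*, d]
    """
    scores = torch.softmax(gate(x), dim=-1)            # [tokens, num_experts]
    weights, idx = torch.topk(scores, k, dim=-1)       # keep the k best per token
    weights = weights / weights.sum(-1, keepdim=True)  # renormalize over the k
    out = torch.zeros_like(x)
    for e, expert in enumerate(experts):
        for rank in range(k):
            mask = idx[:, rank] == e                   # tokens routed to expert e
            if mask.any():
                out[mask] += weights[mask, rank].unsqueeze(-1) * expert(x[mask])
    return out


# Tiny smoke test: 4 experts over 10 tokens of dimension 8
d, n_exp = 8, 4
gate = nn.Linear(d, n_exp)
experts = [nn.Linear(d, d) for _ in range(n_exp)]
print(topk_route(torch.randn(10, d), gate, experts).shape)  # torch.Size([10, 8])
```

Only the selected experts execute for a given token, which is where the compute savings come from; the full architecture below adds per-expert capacity limits and a load-balancing loss on top of this dispatch.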
### 2.2 An MoE-Based Word Embedding Architecture

```python
import torch
import torch.nn as nn


class SparseMoEEmbedding(nn.Module):
    """Word embedding based on a sparse mixture of experts."""

    def __init__(self, vocab_size, embedding_dim, num_experts=8,
                 capacity_factor=1.0, num_selected_experts=2):
        super().__init__()
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.num_experts = num_experts
        self.capacity_factor = capacity_factor
        self.num_selected_experts = num_selected_experts

        # Expert networks: each expert is an independent embedding matrix
        self.experts = nn.ModuleList([
            nn.Embedding(vocab_size, embedding_dim)
            for _ in range(num_experts)
        ])

        # Gating network
        self.gate = nn.Sequential(
            nn.Linear(embedding_dim, 256),
            nn.ReLU(),
            nn.Linear(256, num_experts),
            nn.Softmax(dim=-1),
        )

        # Base embedding used as the gating-network input
        self.base_embedding = nn.Embedding(vocab_size, embedding_dim)

    def forward(self, token_ids, return_gating_weights=False):
        """Forward pass. token_ids: [batch_size, seq_len]"""
        batch_size, seq_len = token_ids.shape

        # Base embeddings used for the gate computation
        base_embeds = self.base_embedding(token_ids)  # [batch, seq_len, embed_dim]

        # Flatten for batched processing
        flat_token_ids = token_ids.view(-1)
        flat_base_embeds = base_embeds.view(-1, self.embedding_dim)

        # Gate weights
        gate_weights = self.gate(flat_base_embeds)  # [batch*seq_len, num_experts]

        # Select the top-k experts per token
        topk_weights, topk_indices = torch.topk(
            gate_weights, self.num_selected_experts, dim=-1
        )

        # Renormalize the selected weights
        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)

        # Initialize the output
        output = torch.zeros(
            batch_size * seq_len, self.embedding_dim, device=token_ids.device
        )

        # Per-expert token capacity (for load balancing)
        expert_capacity = int(
            self.capacity_factor * (batch_size * seq_len) / self.num_experts
        )

        # Route tokens to experts; if a token's best expert is full,
        # fall back to its next-ranked expert
        expert_counts = torch.zeros(self.num_experts, device=token_ids.device)
        for token_idx in range(len(flat_token_ids)):
            for expert_rank in range(self.num_selected_experts):
                expert_idx = topk_indices[token_idx, expert_rank].item()
                if expert_counts[expert_idx] < expert_capacity:
                    # Assign the token to this expert
                    weight = topk_weights[token_idx, expert_rank]
                    expert_output = self.experts[expert_idx](
                        flat_token_ids[token_idx].unsqueeze(0)
                    )
                    output[token_idx] += weight * expert_output.squeeze(0)
                    expert_counts[expert_idx] += 1
                    break

        # Reshape back to the original layout
        output = output.view(batch_size, seq_len, self.embedding_dim)

        # Load-balancing loss (encourages even expert usage)
        if self.training:
            load_balance_loss = self._compute_load_balance_loss(gate_weights)
        else:
            load_balance_loss = torch.tensor(0.0, device=token_ids.device)

        if return_gating_weights:
            return (output, load_balance_loss,
                    gate_weights.view(batch_size, seq_len, -1))
        return output, load_balance_loss

    def _compute_load_balance_loss(self, gate_weights):
        """Compute the load-balancing loss."""
        # Average gate weight per expert
        mean_gate = gate_weights.mean(dim=0)
        # Variance of the expert-selection frequencies
        return torch.var(mean_gate)


class MoETransformerLayer(nn.Module):
    """A Transformer layer with an MoE sublayer."""

    def __init__(self, d_model, nhead, dim_feedforward, num_experts=4,
                 dropout=0.1):
        super().__init__()
        self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout)
        self.moe_ffn = SparseMoEEmbedding(
            vocab_size=10000,  # adjust for real use
            embedding_dim=dim_feedforward,
            num_experts=num_experts,
        )
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

        # Projects the MoE output (dim_feedforward) back to the model
        # dimension so the residual connection type-checks
        self.projection = nn.Linear(dim_feedforward, d_model)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        # Self-attention sublayer
        attn_output, _ = self.self_attn(
            src, src, src,
            attn_mask=src_mask,
            key_padding_mask=src_key_padding_mask,
        )
        src = src + self.dropout(attn_output)
        src = self.norm1(src)

        # MoE sublayer. SparseMoEEmbedding expects token ids rather than
        # hidden states, so position indices serve as a stand-in here
        position_ids = (
            torch.arange(src.shape[1], device=src.device)
            .unsqueeze(0)
            .repeat(src.shape[0], 1)
        )
        moe_output, load_balance_loss = self.moe_ffn(position_ids)
        src = src + self.dropout(self.projection(moe_output))
        src = self.norm2(src)
        return src, load_balance_loss
```
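A minimal end-to-end exercise of the embedding module, with simulated data (the vocabulary size, dimensions, and batch shape below are illustrative assumptions):

```python
# Usage example
if __name__ == "__main__":
    # Create simulated data
    vocab_size, embed_dim = 1000, 64
    moe_emb = SparseMoEEmbedding(vocab_size, embed_dim,
                                 num_experts=4, num_selected_experts=2)
    token_ids = torch.randint(0, vocab_size, (2, 16))  # [batch=2, seq_len=16]

    # Modules default to training mode, so the load-balance loss is computed
    output, lb_loss = moe_emb(token_ids)
    print(f"Output shape: {output.shape}")             # torch.Size([2, 16, 64])
    print(f"Load-balance loss: {lb_loss.item():.6f}")
```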