word2vec工具主要包含两个模型:跳字模型(skip-gram)和连续词袋模型(continuous bag of words,简称CBOW),以及两种高效训练的方法:负采样(negative sampling)和层序softmax(hierarchical softmax)。值得一提的是,word2vec词向量可以较好地表达不同词之间的相似和类比关系。
index = list(range(total)) np.random.shuffle(index)
all_texts = [] all_labels = [] for i in index: all_texts.append(texts[i]) all_labels.append(labels[i])
label2id = {} for i in range(total): label = str(all_labels[i]) if label not in label2id: label2id[label] = [i] else: label2id[label].append(i)
all_index = [[] for _ in range(fold_num)] for label, data in label2id.items(): # print(label, len(data)) batch_size = int(len(data) / fold_num) other = len(data) - batch_size * fold_num for i in range(fold_num): cur_batch_size = batch_size + 1if i < other else batch_size # print(cur_batch_size) batch_data = [data[i * batch_size + b] for b in range(cur_batch_size)] all_index[i].extend(batch_data)
batch_size = int(total / fold_num) other_texts = [] other_labels = [] other_num = 0 start = 0 for fold in range(fold_num): num = len(all_index[fold]) texts = [all_texts[i] for i in all_index[fold]] labels = [all_labels[i] for i in all_index[fold]]
if num > batch_size: fold_texts = texts[:batch_size] other_texts.extend(texts[batch_size:]) fold_labels = labels[:batch_size] other_labels.extend(labels[batch_size:]) other_num += num - batch_size elif num < batch_size: end = start + batch_size - num fold_texts = texts + other_texts[start: end] fold_labels = labels + other_labels[start: end] start = end else: fold_texts = texts fold_labels = labels
assertbatch_size == len(fold_labels)
# shuffle index = list(range(batch_size)) np.random.shuffle(index)
shuffle_fold_texts = [] shuffle_fold_labels = [] for i in index: shuffle_fold_texts.append(fold_texts[i]) shuffle_fold_labels.append(fold_labels[i])
data = {'label': shuffle_fold_labels, 'text': shuffle_fold_texts} fold_data.append(data)
logging.info("Fold lens %s", str([len(data['label']) for data in fold_data]))
return fold_data
fold_data = all_data2fold(10)
## 3 # build train data for word2vec fold_id = 9
train_texts = [] for i in range(0, fold_id): data = fold_data[i] train_texts.extend(data['text']) logging.info('Total %d docs.' % len(train_texts))
## 4 logging.info('Start training...') from gensim.models.word2vec import Word2Vec
num_features = 100# Word vector dimensionality num_workers = 8# Number of threads to run in parallel