import re

def find_english(raw_data):
    # Replace every non-alphabetic character with a space, keeping only English letters.
    pattern = re.compile(r'[^a-zA-Z]')
    english = re.sub(pattern, ' ', raw_data)
    return english
raw_sentence_english='''THE recent success of neural networks has boosted research on pattern recognition and data mining. Many machine learning tasks such as object detection [1], [2], machine translation [3], [4], and speech reconition [5], which once heavily relied on handcrafted feature engineering to extract informative feature sets, has recently been revolutionized by various end-to-end deep learning paradigms, e.g., convolutional neural networks (CNNs) [6], recurrent neural networks (RNNs) [7], and autoencoders [8].'''
'THE recent success of neural networks has boosted research on pattern recognition and data mining Many machine learning tasks such as object detection machine translation and speech reconition which once heavily relied on handcrafted feature engineering to extract informative feature sets has recently been revolutionized by various end to end deep learning paradigms e g convolutional neural networks CNNs recurrent neural networks RNNs and autoencoders '
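The spell-check loop further down iterates over splited_sentence_english. A minimal way to build that list from the cleaned string might be the following sketch (the whitespace split and lowercasing are assumptions, chosen to match the lowercase words that appear in the spell-check output):

# Hypothetical tokenization step: split the cleaned string on whitespace
# and lowercase each token before spell checking.
splited_sentence_english = [word.lower() for word in find_english(raw_sentence_english).split()]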
import jieba

def chinese_splitwords(sentence_chinese, suggestions_list=None):
    # Optionally register user-supplied words so jieba keeps them as single tokens.
    if suggestions_list is not None:
        for item in suggestions_list:
            jieba.add_word(item)
    # Segment the sentence into a list of words.
    splited_chinese = [item for item in jieba.cut(sentence_chinese)]
    return splited_chinese
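A call might look like the following (the sample sentence and the custom word are placeholders, not taken from the original post); the first call triggers jieba's one-time dictionary build, logged below:

# Hypothetical usage: the sentence and the custom word are illustrative only.
sentence = '深度学习推动了模式识别和数据挖掘的研究'
print(chinese_splitwords(sentence, suggestions_list=['深度学习']))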
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\Yufei Luo\AppData\Local\Temp\jieba.cache
Loading model cost 0.562 seconds.
Prefix dict has been built successfully.
import enchant

enchant_dict = enchant.Dict('en_US')  # assumes the en_US dictionary is installed

for item in splited_sentence_english:
    if not enchant_dict.check(item):
        # Print the unrecognized word together with enchant's suggestions.
        print('original word: ', item, ' suggested words: ', enchant_dict.suggest(item))
original word: reconition suggested words: ['recondition', 'recognition', 'reconnection', 'reconception', 'reconsecration', 'premonition']
original word: convolutional suggested words: ['convolution al', 'convolution-al', 'convolution', 'involutional', 'convocational', 'coevolutionary']
original word: cnns suggested words: ['conns', 'inns', 'cans', 'cons', 'CNS']
original word: rnns suggested words: ['inns', 'runs', 'scorns']
original word: autoencoders suggested words: ['auto encoders', 'auto-encoders', 'encoders']
from nltk.stem import SnowballStemmer

stemmer = SnowballStemmer('english')
stemmed_sentence_english = [stemmer.stem(item) for item in filtered_splited_sentence_english]
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
lemmatized_sentence_english = [lemmatizer.lemmatize(item) for item in filtered_splited_sentence_english]
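To see how the two normalizations differ, the stemmer and lemmatizer defined above can be compared on a few words (a quick sketch; the sample words are not from the original text):

# Hypothetical comparison of stemming vs. lemmatization on sample words.
for word in ['studies', 'running', 'paradigms']:
    print(word, '->', stemmer.stem(word), '|', lemmatizer.lemmatize(word))

Stemming chops suffixes heuristically ('studies' becomes 'studi'), while the WordNet lemmatizer maps words to dictionary forms ('studies' becomes 'study'); by default it treats every word as a noun, so 'running' stays unchanged unless pos='v' is passed.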
raw_sentences = [
    r'Reinforcement learning is learning what to do—how to map situations to actions—so as to maximize a numerical reward signal.',
    r'The learner is not told which actions to take, but instead must discover which actions yield the most reward by trying them.',
    r'In the most interesting and challenging cases, actions may affect not only the immediate reward but also the next situation and, through that, all subsequent rewards.',
    r'These two characteristics—trial-and-error search and delayed reward—are the two most important distinguishing features of reinforcement learning.',
]
processed_sentences = [preprocess_english_sentence(item) for item in raw_sentences]
import numpy as np

def generate_one_hot_encoding(sentences):
    # Assign each distinct word an index in the vocabulary.
    token_index = {}
    for sentence in sentences:
        for word in sentence:
            if word not in token_index:
                token_index[word] = len(token_index)
    # One row per sentence; a 1 marks that the word occurs in that sentence.
    results = np.zeros((len(sentences), len(token_index)))
    for i in range(len(sentences)):
        for word in sentences[i]:
            results[i, token_index[word]] = 1
    return results
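Applied to the preprocessed sentences from above, the encoder yields one row per sentence (a quick usage sketch; the exact shape depends on the vocabulary produced by the preprocessing step):

one_hot = generate_one_hot_encoding(processed_sentences)
print(one_hot.shape)  # (number of sentences, vocabulary size)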
The following example encodes the sentences as a Bag of Words; compared with one-hot encoding, it additionally records word-frequency counts:
def generate_bag_of_words_encoding(sentences):
    # Build the vocabulary exactly as in the one-hot encoder.
    token_index = {}
    for sentence in sentences:
        for word in sentence:
            if word not in token_index:
                token_index[word] = len(token_index)
    # Count how many times each word occurs in each sentence.
    results = np.zeros((len(sentences), len(token_index)))
    for i in range(len(sentences)):
        for word in sentences[i]:
            results[i, token_index[word]] += 1
    return results
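Usage mirrors the one-hot encoder (a sketch; the printed values depend on the preprocessed corpus):

bag_of_words = generate_bag_of_words_encoding(processed_sentences)
print(bag_of_words.shape)
print(bag_of_words.max())  # largest count of any single word within one sentence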
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

tf_idf_vec = TfidfVectorizer(
    stop_words=nltk.corpus.stopwords.words('english'),
).fit_transform(raw_sentences)
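To inspect the result, it helps to keep a reference to the vectorizer so its vocabulary can be queried afterwards (a sketch; the exact feature set depends on the stop-word list and on raw_sentences, and get_feature_names_out requires scikit-learn 1.0 or newer):

vectorizer = TfidfVectorizer(stop_words=nltk.corpus.stopwords.words('english'))
tf_idf_matrix = vectorizer.fit_transform(raw_sentences)
print(tf_idf_matrix.shape)                      # (number of sentences, number of terms)
print(vectorizer.get_feature_names_out()[:10])  # first few vocabulary terms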
The idea behind word embeddings is to represent an item with a low-dimensional dense vector; the item can be a word, a book title, a product, a film, and so on. The key property of these vectors is that items whose vectors lie close together have similar meanings. For example, "The Avengers" and "Iron Man" end up close to each other, while "The Avengers" and "Gone with the Wind" are correspondingly farther apart. Embeddings can even support a kind of attribute arithmetic, for example \(\text{Vec(Woman)} - \text{Vec(Man)} \approx \text{Vec(Queen)} - \text{Vec(King)}\).
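Given a trained gensim word-vector model (model here is a placeholder; no such model is trained in this post), the analogy above can be checked with the most_similar API:

# Hypothetical check of the analogy property, assuming a trained gensim Word2Vec model.
# 'queen' is expected to rank highly among the results.
print(model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=3))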
The CBOW model takes a word's context as input and predicts the word itself. For example, to learn an embedding for the word "fox" from the sentence "The quick brown fox jumps over the lazy dog", using the 3 words on each side of "fox" as context, the model's input is the six words "The, quick, brown, jumps, over, the". The model structure is shown in the figure below, where each pair of adjacent layers is fully connected:
The Skip-Gram model does the opposite: it takes a single word as input and predicts its surrounding context. Using the same sentence "The quick brown fox jumps over the lazy dog" to learn the embedding of "fox", the model's input is the word "fox" and its output is the 3 words on each side of it, i.e., the six words "The, quick, brown, jumps, over, the". The model structure is shown in the figure below:
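In practice, both architectures are available through gensim's Word2Vec class and are selected with the sg flag (a minimal sketch, assuming gensim 4.x and reusing the tokenized sentences from above as a toy corpus; real embeddings need far more data):

from gensim.models import Word2Vec

# sg=0 trains CBOW (context -> word); sg=1 trains Skip-Gram (word -> context).
# window=3 mirrors the "3 words on either side" used in the examples above.
cbow_model = Word2Vec(processed_sentences, vector_size=50, window=3, min_count=1, sg=0)
skip_gram_model = Word2Vec(processed_sentences, vector_size=50, window=3, min_count=1, sg=1)

print(cbow_model.wv.index_to_key[:5])  # a few words from the learned vocabulary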