5. Text Preprocessing

Author: Chris Albon

Translator: 飞龙

License: CC BY-NC-SA 4.0

Bag of Words


    # Load libraries
    import numpy as np
    from sklearn.feature_extraction.text import CountVectorizer
    import pandas as pd

    # Create text
    text_data = np.array(['I love Brazil. Brazil!',
                          'Sweden is best',
                          'Germany beats both'])

    # Create the bag-of-words feature matrix
    count = CountVectorizer()
    bag_of_words = count.fit_transform(text_data)

    # Show the feature matrix
    bag_of_words.toarray()
    '''
    array([[0, 0, 0, 2, 0, 0, 1, 0],
           [0, 1, 0, 0, 0, 1, 0, 1],
           [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64)
    '''

    # Get the feature names (on scikit-learn >= 1.0, use count.get_feature_names_out())
    feature_names = count.get_feature_names()

    # View the feature names
    feature_names
    # ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

    # Create a data frame
    pd.DataFrame(bag_of_words.toarray(), columns=feature_names)
|   | beats | best | both | brazil | germany | is | love | sweden |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0 | 0 | 0 | 2 | 0 | 0 | 1 | 0 |
| 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 |
| 2 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
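CountVectorizer can also count multi-word n-grams and drop stop words. A minimal sketch of that (not part of the original recipe, reusing the same text_data and scikit-learn's ngram_range and stop_words parameters):

    from sklearn.feature_extraction.text import CountVectorizer
    import numpy as np

    text_data = np.array(['I love Brazil. Brazil!',
                          'Sweden is best',
                          'Germany beats both'])

    # Count unigrams and bigrams, dropping scikit-learn's built-in English stop words
    count_2gram = CountVectorizer(ngram_range=(1, 2), stop_words='english')
    bag_2gram = count_2gram.fit_transform(text_data)

    # Inspect which n-grams became features (the exact list depends on the stop word list)
    count_2gram.get_feature_names_out()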

Parsing HTML

    # Load library
    from bs4 import BeautifulSoup

    # Create some HTML code
    html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"

    # Parse the html
    soup = BeautifulSoup(html, "lxml")

    # Find the <div> with the class "full_name" and show its text
    soup.find("div", { "class" : "full_name" }).text
    # 'Masego Azra'

Removing Punctuation

    # Load libraries
    import string
    import numpy as np

    # Create text
    text_data = ['Hi!!!! I. Love. This. Song....',
                 '10000% Agree!!!! #LoveIT',
                 'Right?!?!']

    # Create a function that strips all punctuation using string.punctuation
    def remove_punctuation(sentence: str) -> str:
        return sentence.translate(str.maketrans('', '', string.punctuation))

    # Apply the function
    [remove_punctuation(sentence) for sentence in text_data]
    # ['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

Removing Stop Words

    # Load library
    from nltk.corpus import stopwords

    # The first time, you will need to download the set of stop words
    import nltk
    nltk.download('stopwords')
    '''
    [nltk_data] Downloading package stopwords to
    [nltk_data]     /Users/chrisalbon/nltk_data...
    [nltk_data]   Package stopwords is already up-to-date!
    True
    '''

    # Create word tokens
    tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

    # Load the stop words
    stop_words = stopwords.words('english')

    # Show the stop words
    stop_words[:5]
    # ['i', 'me', 'my', 'myself', 'we']

    # Remove the stop words
    [word for word in tokenized_words if word not in stop_words]
    # ['going', 'go', 'store', 'park']
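Note that NLTK's English stop word list is all lowercase, so it only matches tokens that are already lowercased. A minimal sketch of guarding against that (the lowercasing step is an addition, not part of the original recipe):

    from nltk.corpus import stopwords

    stop_words = stopwords.words('english')

    # Lowercase each token before comparing it against the stop word list
    tokens = ['I', 'Am', 'Going', 'To', 'The', 'Store']
    [word for word in (token.lower() for token in tokens) if word not in stop_words]
    # ['going', 'store']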

Replacing Characters

    # Import library
    import re

    # Create text
    text_data = ['Interrobang. By Aishwarya Henriette',
                 'Parking And Going. By Karl Gautier',
                 'Today Is The night. By Jarek Prakash']

    # Remove periods
    remove_periods = [string.replace('.', '') for string in text_data]

    # Show the text
    remove_periods
    '''
    ['Interrobang By Aishwarya Henriette',
     'Parking And Going By Karl Gautier',
     'Today Is The night By Jarek Prakash']
    '''

    # Create a function
    def replace_letters_with_X(string: str) -> str:
        return re.sub(r'[a-zA-Z]', 'X', string)

    # Apply the function
    [replace_letters_with_X(string) for string in remove_periods]
    '''
    ['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
     'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
     'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']
    '''

Stemming


    # Load library
    from nltk.stem.porter import PorterStemmer

    # Create word tokens
    tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

Stemming reduces words to their stems by identifying and removing affixes (for example, gerund endings) while preserving the word's root meaning. NLTK's PorterStemmer implements the widely used Porter stemming algorithm.

    # Create the stemmer
    porter = PorterStemmer()

    # Apply the stemmer
    [porter.stem(word) for word in tokenized_words]
    # ['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

Removing Whitespace

    # Create text
    text_data = [' Interrobang. By Aishwarya Henriette ',
                 'Parking And Going. By Karl Gautier',
                 ' Today Is The night. By Jarek Prakash ']

    # Strip whitespace
    strip_whitespace = [string.strip() for string in text_data]

    # Show the text
    strip_whitespace
    '''
    ['Interrobang. By Aishwarya Henriette',
     'Parking And Going. By Karl Gautier',
     'Today Is The night. By Jarek Prakash']
    '''

Part-of-Speech Tagging

    # Load libraries
    from nltk import pos_tag
    from nltk import word_tokenize

    # Create text
    text_data = "Chris loved outdoor running"

    # Use the pre-trained part-of-speech tagger
    text_tagged = pos_tag(word_tokenize(text_data))

    # Show the parts of speech
    text_tagged
    # [('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]

The output is a list of tuples containing each word and its part-of-speech tag. NLTK uses the Penn Treebank tag set; a short sketch after the table shows one way to put the tags to use.

| Tag | Part of speech |
| --- | --- |
| NNP | Proper noun, singular |
| NN | Noun, singular or mass |
| RB | Adverb |
| VBD | Verb, past tense |
| VBG | Verb, gerund or present participle |
| JJ | Adjective |
| PRP | Personal pronoun |
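A common use of these tags is to filter tokens by word class. A minimal sketch (not part of the original recipe) that keeps only the nouns from the sentence above; the first run may also require nltk.download('averaged_perceptron_tagger') and nltk.download('punkt'):

    from nltk import pos_tag, word_tokenize

    text_tagged = pos_tag(word_tokenize("Chris loved outdoor running"))

    # Keep words whose Penn Treebank tag marks them as nouns (NN, NNS, NNP, NNPS)
    [word for word, tag in text_tagged if tag.startswith('NN')]
    # ['Chris'] -- given the tagging shown above, only the proper noun remains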

TF-IDF


    # Load libraries
    import numpy as np
    from sklearn.feature_extraction.text import TfidfVectorizer
    import pandas as pd

    # Create text
    text_data = np.array(['I love Brazil. Brazil!',
                          'Sweden is best',
                          'Germany beats both'])

    # Create the tf-idf feature matrix
    tfidf = TfidfVectorizer()
    feature_matrix = tfidf.fit_transform(text_data)

    # Show the tf-idf feature matrix
    feature_matrix.toarray()
    '''
    array([[ 0.        ,  0.        ,  0.        ,  0.89442719,  0.        ,
             0.        ,  0.4472136 ,  0.        ],
           [ 0.        ,  0.57735027,  0.        ,  0.        ,  0.        ,
             0.57735027,  0.        ,  0.57735027],
           [ 0.57735027,  0.        ,  0.57735027,  0.        ,  0.57735027,
             0.        ,  0.        ,  0.        ]])
    '''

    # Show the feature names (on scikit-learn >= 1.0, use tfidf.get_feature_names_out())
    tfidf.get_feature_names()
    # ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

    # Create a data frame
    pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())
|   | beats | best | both | brazil | germany | is | love | sweden |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 0.00000 | 0.00000 | 0.00000 | 0.894427 | 0.00000 | 0.00000 | 0.447214 | 0.00000 |
| 1 | 0.00000 | 0.57735 | 0.00000 | 0.000000 | 0.00000 | 0.57735 | 0.000000 | 0.57735 |
| 2 | 0.57735 | 0.00000 | 0.57735 | 0.000000 | 0.57735 | 0.00000 | 0.000000 | 0.00000 |
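To see where the first row's numbers come from: with TfidfVectorizer's defaults (raw counts as term frequency, smoothed idf, L2 normalization), 'brazil' and 'love' each occur in only one document, so they share the same idf and the row is simply the counts (2, 1) rescaled to unit length. A minimal check of that arithmetic, assuming those defaults:

    import numpy as np

    n_docs = 3   # number of documents
    df = 1       # 'brazil' and 'love' each appear in exactly one document

    # scikit-learn's smoothed idf: ln((1 + n) / (1 + df)) + 1
    idf = np.log((1 + n_docs) / (1 + df)) + 1

    # tf-idf for the first document before normalization: counts 2 and 1 times the idf
    row = np.array([2 * idf, 1 * idf])

    # L2-normalize, which reproduces the first row of the matrix above
    row / np.linalg.norm(row)
    # array([0.89442719, 0.4472136 ])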

Tokenizing Text

    # Load library
    from nltk.tokenize import word_tokenize, sent_tokenize

    # Create text
    string = "The science of today is the technology of tomorrow. Tomorrow is today."

    # Tokenize into words
    word_tokenize(string)
    '''
    ['The',
     'science',
     'of',
     'today',
     'is',
     'the',
     'technology',
     'of',
     'tomorrow',
     '.',
     'Tomorrow',
     'is',
     'today',
     '.']
    '''

    # Tokenize into sentences
    sent_tokenize(string)
    # ['The science of today is the technology of tomorrow.', 'Tomorrow is today.']
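Like the stop words earlier, NLTK's tokenizers rely on downloadable data: word_tokenize and sent_tokenize use the Punkt sentence tokenizer models, which have to be fetched once (recent NLTK releases may ask for 'punkt_tab' instead):

    import nltk

    # Download the Punkt tokenizer models before the first use of word_tokenize / sent_tokenize
    nltk.download('punkt')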