nltk_spam_dtr
Typer | Posted on | |
# ------------------------------
# Install dependencies (if not already installed)
# ------------------------------
pip install textblob # Best run in a terminal; in Jupyter, "!pip install textblob" is more reliable
# ------------------------------
# Import required libraries
# ------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import sklearn
import nltk
# nltk.download('popular') # Downloads a large bundle of resources in one go (optional)
nltk.download('punkt') # Used for tokenization
nltk.download('punkt_tab') # Extension resource for punkt
# nltk.download('averaged_perceptron_tagger') # Part-of-speech tagger (optional)
nltk.download('wordnet') # WordNet dictionary resource (used for lemmatization)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
# ------------------------------
# Import custom helper functions (⚠️ requires the defs.py file to exist)
# ------------------------------
from defs import get_tokens
from defs import get_lemmas
# ------------------------------
# Load the SMS dataset (the CSV has no header row)
# ------------------------------
sms = pd.read_csv(
    '/.../sms_spam_no_header.csv',
    sep=',',
    header=None,  # same as the default when `names` is given; stated explicitly
    names=["type", "text"],
)
# ------------------------------
# Split into training and test sets (30% held out for testing)
# ------------------------------
# A fixed random_state makes the split -- and therefore the metrics reported
# further down -- reproducible between runs. Stratifying on the label keeps
# the label proportions the same in the training and test partitions.
text_train, text_test, type_train, type_test = train_test_split(
    sms['text'],
    sms['type'],
    test_size=0.3,
    random_state=42,
    stratify=sms['type'],
)
# ------------------------------
# Extract text features (bag-of-words using the get_lemmas analyzer)
# ------------------------------
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass, so the custom analyzer is called once per training message instead of
# twice (once by fit, again by transform). `bow` is still the fitted
# vectorizer afterwards and can be reused on unseen text.
bow = CountVectorizer(analyzer=get_lemmas)
sms_bow = bow.fit_transform(text_train)
# ------------------------------
# TF-IDF weighting
# ------------------------------
# IDF statistics are learned from the training counts only.
tfidf = TfidfTransformer()
sms_tfidf = tfidf.fit_transform(sms_bow)
# ------------------------------
# Train the Naive Bayes model
# ------------------------------
spam_detect_model = MultinomialNB().fit(sms_tfidf, type_train)
# ------------------------------
# Predict on the test set and evaluate
# ------------------------------
# Push the test messages through the already-fitted vectorizer and TF-IDF
# transformer (transform only, no refitting), then classify them.
sms_test_bow = bow.transform(text_test)
sms_test_tfidf = tfidf.transform(sms_test_bow)
predictions = spam_detect_model.predict(sms_test_tfidf)

# Per-class precision / recall / F1, followed by overall accuracy.
report = classification_report(y_true=type_test, y_pred=predictions)
print(report)
accuracy = accuracy_score(y_true=type_test, y_pred=predictions)
print(f"Accuracy: {accuracy:.2f}")