# nltk_spam_dtr: SMS spam classification with a bag-of-words + TF-IDF + Multinomial Naive Bayes pipeline.

# ------------------------------
# Install dependencies (if not already installed)
# ------------------------------
# The line below is a shell command, not Python: run it in a terminal, or as
# `!pip install textblob` in a Jupyter cell. It is kept as a comment so that
# this file parses as a regular Python script.
# pip install textblob

# ------------------------------
# 导入必要库
# ------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import sklearn

import nltk

# Download the NLTK resources enabled for this script, one call per resource
# and in this order:
#   punkt     - used for tokenization
#   punkt_tab - extended punkt resources
#   wordnet   - WordNet dictionary data (used for lemmatization)
# Optional resources that are left disabled: 'popular' (downloads a large
# bundle of resources in one go) and 'averaged_perceptron_tagger'
# (part-of-speech tagger).
for _resource in ('punkt', 'punkt_tab', 'wordnet'):
    nltk.download(_resource)

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# ------------------------------
# 引用自定义模块(⚠️需 defs.py 文件存在)
# ------------------------------
from defs import get_tokens
from defs import get_lemmas

# ------------------------------
# Load the SMS dataset (the CSV file has no header row)
# ------------------------------
# Column names are supplied explicitly: "type" first, "text" second.
# NOTE(review): the '/.../' prefix looks like a placeholder - point it at the
# real location of the CSV before running.
sms = pd.read_csv(
    '/.../sms_spam_no_header.csv',
    sep=',',
    names=["type", "text"],
)

# ------------------------------
# Split into training and test sets (30% held out for testing)
# ------------------------------
# A fixed random_state makes the shuffle deterministic, so the metrics printed
# at the end of the script are reproducible from one run to the next.
# NOTE(review): if the label classes are imbalanced, consider also passing
# stratify=sms['type'] to keep class proportions equal in both subsets.
text_train, text_test, type_train, type_test = train_test_split(
    sms['text'], sms['type'], test_size=0.3, random_state=42
)

# ------------------------------
# Extract text features (bag-of-words model built with get_lemmas)
# ------------------------------
# get_lemmas is passed as the analyzer, i.e. the vectorizer calls it on each
# raw message to obtain that message's features.
# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; calling fit(...) and then transform(...) on the same data would run
# get_lemmas over every training message twice.
bow = CountVectorizer(analyzer=get_lemmas)
sms_bow = bow.fit_transform(text_train)

# ------------------------------
# TF-IDF weighting
# ------------------------------
# Fit the weights on the training counts and re-weight those counts in one step.
tfidf = TfidfTransformer()
sms_tfidf = tfidf.fit_transform(sms_bow)

# ------------------------------
# Train the Multinomial Naive Bayes model
# ------------------------------
# Fit on the TF-IDF features of the training messages and their labels.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(sms_tfidf, type_train)

# ------------------------------
# Predict on the test set and evaluate
# ------------------------------
# The test messages go through the vectorizer and TF-IDF transformer that were
# fitted on the training data (transform only, nothing is refitted here).
sms_test_bow = bow.transform(text_test)
sms_test_tfidf = tfidf.transform(sms_test_bow)
predictions = spam_detect_model.predict(sms_test_tfidf)

# Print the classification report first, then the accuracy to two decimals.
report = classification_report(type_test, predictions)
print(report)
print(f"Accuracy: {accuracy_score(type_test, predictions):.2f}")

 

# End of script.