Kaggle SMS Spam Collection Dataset, Part 1
Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
Create a small example corpus
simple_train = ['call you tonight', 'Call me a cab', 'Please call me... PLEASE!']
Apply CountVectorizer (bag of words)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# fit the vectorizer (learn the vocabulary)
vect.fit(simple_train)
# get the learned vocabulary
vect.get_feature_names_out()
Output each word's count per document
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm.toarray()
Display as a DataFrame
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())
# check the type of the document-term matrix
print(type(simple_train_dtm))
# examine the sparse matrix contents
print(simple_train_dtm)
Create test data
simple_test = ["please don't call me"]
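A quick sanity check (my own sketch, not from the original notebook): transforming simple_test with the vect fitted above counts only words already in the training vocabulary, so unseen tokens such as "don't" simply get no column.
simple_test_dtm = vect.transform(simple_test)
# only "call", "me", and "please" from the training vocabulary are counted
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())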
Load the Kaggle spam dataset
sms = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
sms.dropna(how="any", inplace=True, axis=1)
sms.columns = ['label', 'message']
sms.head()

Summarize the data
# count: number of non-null values
# unique: number of unique values
# top: the most frequent value
# freq: how many times the most frequent value appears
sms.describe()

# split the summary by spam vs. ham
# freq: count of the most frequent value within each group
sms.groupby('label').describe()
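It also helps to check the class balance before modeling; a minimal sketch (the exact proportions depend on the dataset version, roughly 87% ham to 13% spam here):
# absolute and relative frequencies of each label
print(sms.label.value_counts())
print(sms.label.value_counts(normalize=True))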

Convert the labels to numbers
# models learn from numbers, so map the text labels to 0/1
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})
sms.head()

Add a message-length column
sms['message_len'] = sms.message.apply(len)
sms.head()

Plot message length by spam vs. ham
plt.figure(figsize=(12, 8))
sms[sms.label=='ham'].message_len.plot(bins=35, kind='hist', color='blue',
                                       label='Ham messages', alpha=0.6)
sms[sms.label=='spam'].message_len.plot(kind='hist', color='red',
                                        label='Spam messages', alpha=0.6)
plt.legend()
plt.xlabel("Message Length")

Message length statistics by spam vs. ham
sms[sms.label=='ham'].describe()

sms[sms.label=='spam'].describe()

Print the message with length 910
sms[sms.message_len == 910].message.iloc[0]

import string
from nltk.corpus import stopwords
# nltk.download('stopwords') may be needed the first time if the corpus is not already installed

def text_process(mess):
    """
    Takes in a string of text, then:
    1. Removes all punctuation
    2. Removes all stopwords
    3. Returns the cleaned text as a single string
    """
    # stopword list: standard English stopwords plus common SMS abbreviations
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    # string.punctuation holds the punctuation characters
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    # lowercase each word before checking it against the stopword list
    return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])
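A quick look at what text_process does to one raw message (my own usage sketch):
sample = sms.message.iloc[0]
print(sample)                 # original message
print(text_process(sample))  # punctuation and stopwords removed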
Apply stopword removal
sms['clean_msg'] = sms.message.apply(text_process)
Check which words appear most often
from collections import Counter
words = sms[sms.label=='ham'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])
ham_words = Counter()
for msg in words:
    ham_words.update(msg)
print(ham_words.most_common(50))
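The same counting can be repeated for the spam messages to contrast the two vocabularies (a sketch following the same pattern as above):
spam_tokens = sms[sms.label=='spam'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])
spam_words = Counter()
for msg in spam_tokens:
    spam_words.update(msg)
print(spam_words.most_common(50))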
Split into training and test sets
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.clean_msg
y = sms.label_num
print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
Apply CountVectorizer to the SMS data
from sklearn.feature_extraction.text import CountVectorizer
# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(X_train)
# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = vect.transform(X_train)
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)
# examine the document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
print(type(X_test_dtm), X_test_dtm.shape)
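Two quick sanity checks on the fitted vectorizer (my own sketch): the size of the learned vocabulary and how sparse the document-term matrix is.
# number of distinct tokens learned from the training messages
print(len(vect.get_feature_names_out()))
# fraction of non-zero entries in the sparse count matrix
print(X_train_dtm.nnz / (X_train_dtm.shape[0] * X_train_dtm.shape[1]))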
Apply the TF-IDF transformer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)
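With its defaults (smooth_idf=True, norm='l2'), TfidfTransformer reweights each raw count as tf * (ln((1 + n) / (1 + df)) + 1), where n is the number of documents and df is the term's document frequency, then L2-normalizes each row, so words appearing in almost every message are down-weighted. A minimal sketch that keeps the result (the name X_train_tfidf is mine); note that the Naive Bayes model below is still fit on the raw count matrix X_train_dtm, while the pipeline further down is what actually chains TF-IDF into the model:
X_train_tfidf = tfidf_transformer.transform(X_train_dtm)
# same shape as the count matrix; only the weights change
print(X_train_tfidf.shape)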
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
%time nb.fit(X_train_dtm, y_train)
from sklearn import metrics
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
# accuracy
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred_class))
# confusion matrix
print("=======Confusion Matrix===========")
metrics.confusion_matrix(y_test, y_pred_class)
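The raw confusion matrix is easier to read with labels; scikit-learn puts the true classes on the rows and the predicted classes on the columns, so with ham=0 and spam=1 the layout is [[TN, FP], [FN, TP]] (my own formatting sketch):
pd.DataFrame(metrics.confusion_matrix(y_test, y_pred_class),
             index=['true ham', 'true spam'], columns=['pred ham', 'pred spam'])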
Ham messages misclassified as spam (false positives)
# X_test[(y_pred_class==1) & (y_test==0)]
X_test[y_pred_class > y_test]
Spam messages misclassified as ham (false negatives)
X_test[y_pred_class < y_test]
# predicted probability of spam for each test message
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
# calculate the ROC AUC
metrics.roc_auc_score(y_test, y_pred_prob)
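The AUC summarizes the whole ROC curve; the curve itself can be drawn with metrics.roc_curve (a sketch, not part of the original notebook):
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Multinomial NB')
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()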

Predict with a pipeline and measure accuracy
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
pipe = Pipeline([('bow', CountVectorizer()),
                 ('tfid', TfidfTransformer()),
                 ('model', MultinomialNB())])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred))
# print the confusion matrix
print("=======Confision Matrix===========")
metrics.confusion_matrix(y_test, y_pred)
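Because the pipeline starts from raw text (CountVectorizer is its first step), it can classify new messages directly; a usage sketch with made-up examples:
new_messages = ["WINNER!! You have won a free prize, call now to claim",
                "are we still meeting for lunch today?"]
print(pipe.predict(new_messages))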
Predict with a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
# time the fit
%time logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred_class))
# print the confusion matrix
print("=======Confision Matrix===========")
print(metrics.confusion_matrix(y_test, y_pred_class))
# calculate AUC
print("=======ROC AUC Score===========")
print(metrics.roc_auc_score(y_test, y_pred_prob))
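Because logistic regression is a linear model, its coefficients show which words push a message toward the spam class; a sketch using the CountVectorizer fitted above (the helper variable coefs is mine):
coefs = pd.Series(logreg.coef_[0], index=vect.get_feature_names_out())
# largest positive weights = strongest spam indicators
print(coefs.sort_values(ascending=False).head(10))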