Kaggle SMS Spam Collection Dataset, Part 1
Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")
Create a small example corpus
simple_train = ['call you tonight', 'Call me a cab', 'Please call me... PLEASE!']
Apply CountVectorizer (bag of words)
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
# fit the vectorizer (learn the vocabulary)
vect.fit(simple_train)
# get the learned vocabulary
vect.get_feature_names_out()
Output each word's count per document
simple_train_dtm = vect.transform(simple_train)
simple_train_dtm.toarray()
Display as a DataFrame
pd.DataFrame(simple_train_dtm.toarray(), columns=vect.get_feature_names_out())
# check the type of the document-term matrix
print(type(simple_train_dtm))
# examine the sparse matrix contents
print(simple_train_dtm)
Create test data
simple_test = ["please don't call me"]
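A quick sanity check (my own sketch, not from the original notebook): transforming simple_test with the vect fitted above counts only words already in the training vocabulary, so unseen tokens such as "don't" simply get no column.
simple_test_dtm = vect.transform(simple_test)
# only "call", "me", and "please" from the training vocabulary are counted
pd.DataFrame(simple_test_dtm.toarray(), columns=vect.get_feature_names_out())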
Load the Kaggle spam dataset
sms = pd.read_csv("/kaggle/input/sms-spam-collection-dataset/spam.csv", encoding='latin-1')
sms.dropna(how="any", inplace=True, axis=1)
sms.columns = ['label', 'message']
sms.head()

Summarize the data
# count: number of non-null values
# unique: number of unique values
# top: the most frequent value
# freq: how many times the most frequent value appears
sms.describe()

# split the summary by spam vs. ham
# freq: count of the most frequent value within each group
sms.groupby('label').describe()
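It also helps to check the class balance before modeling; a minimal sketch (the exact proportions depend on the dataset version, roughly 87% ham to 13% spam here):
# absolute and relative frequencies of each label
print(sms.label.value_counts())
print(sms.label.value_counts(normalize=True))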

Convert the labels to numbers
# models learn from numbers, so map the text labels to 0/1
sms['label_num'] = sms.label.map({'ham':0, 'spam':1})
sms.head()

Add a message-length column
sms['message_len'] = sms.message.apply(len)
sms.head()

Plot message length by spam vs. ham
plt.figure(figsize=(12, 8))
sms[sms.label=='ham'].message_len.plot(bins=35, kind='hist', color='blue',
                                       label='Ham messages', alpha=0.6)
sms[sms.label=='spam'].message_len.plot(kind='hist', color='red',
                                        label='Spam messages', alpha=0.6)
plt.legend()
plt.xlabel("Message Length")

Message length statistics by spam vs. ham
sms[sms.label=='ham'].describe()

sms[sms.label=='spam'].describe()

Print the message with length 910
sms[sms.message_len == 910].message.iloc[0]

import string
from nltk.corpus import stopwords
# nltk.download('stopwords') may be needed the first time if the corpus is not already installed

def text_process(mess):
    """
    Takes in a string of text, then:
    1. Removes all punctuation
    2. Removes all stopwords
    3. Returns the cleaned text as a single string
    """
    # stopword list: standard English stopwords plus common SMS abbreviations
    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    # string.punctuation holds the punctuation characters
    nopunc = [char for char in mess if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    # lowercase each word before checking it against the stopword list
    return ' '.join([word for word in nopunc.split() if word.lower() not in STOPWORDS])
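A quick look at what text_process does to one raw message (my own usage sketch):
sample = sms.message.iloc[0]
print(sample)                 # original message
print(text_process(sample))  # punctuation and stopwords removed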
Apply stopword removal
sms['clean_msg'] = sms.message.apply(text_process)
Check which words appear most often
from collections import Counter
words = sms[sms.label=='ham'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])
ham_words = Counter()
for msg in words:
    ham_words.update(msg)
print(ham_words.most_common(50))
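The same counting can be repeated for the spam messages to contrast the two vocabularies (a sketch following the same pattern as above):
spam_tokens = sms[sms.label=='spam'].clean_msg.apply(lambda x: [word.lower() for word in x.split()])
spam_words = Counter()
for msg in spam_tokens:
    spam_words.update(msg)
print(spam_words.most_common(50))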
Split into training and test sets
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
# how to define X and y (from the SMS data) for use with COUNTVECTORIZER
X = sms.clean_msg
y = sms.label_num
print(X.shape)
print(y.shape)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
Apply CountVectorizer to the SMS data
from sklearn.feature_extraction.text import CountVectorizer
# instantiate the vectorizer
vect = CountVectorizer()
vect.fit(X_train)
# learn training data vocabulary, then use it to create a document-term matrix
X_train_dtm = vect.transform(X_train)
# equivalently: combine fit and transform into a single step
X_train_dtm = vect.fit_transform(X_train)
# examine the document-term matrix
print(type(X_train_dtm), X_train_dtm.shape)
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = vect.transform(X_test)
print(type(X_test_dtm), X_test_dtm.shape)
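Two quick sanity checks on the fitted vectorizer (my own sketch): the size of the learned vocabulary and how sparse the document-term matrix is.
# number of distinct tokens learned from the training messages
print(len(vect.get_feature_names_out()))
# fraction of non-zero entries in the sparse count matrix
print(X_train_dtm.nnz / (X_train_dtm.shape[0] * X_train_dtm.shape[1]))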
Apply the TF-IDF transformer
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_dtm)
tfidf_transformer.transform(X_train_dtm)
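With its defaults (smooth_idf=True, norm='l2'), TfidfTransformer reweights each raw count as tf * (ln((1 + n) / (1 + df)) + 1), where n is the number of documents and df is the term's document frequency, then L2-normalizes each row, so words appearing in almost every message are down-weighted. A minimal sketch that keeps the result (the name X_train_tfidf is mine); note that the Naive Bayes model below is still fit on the raw count matrix X_train_dtm, while the pipeline further down is what actually chains TF-IDF into the model:
X_train_tfidf = tfidf_transformer.transform(X_train_dtm)
# same shape as the count matrix; only the weights change
print(X_train_tfidf.shape)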
# import and instantiate a Multinomial Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
%time nb.fit(X_train_dtm, y_train)
from sklearn import metrics
# make class predictions for X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
# accuracy
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred_class))
# confusion matrix
print("=======Confusion Matrix===========")
metrics.confusion_matrix(y_test, y_pred_class)
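The raw confusion matrix is easier to read with labels; scikit-learn puts the true classes on the rows and the predicted classes on the columns, so with ham=0 and spam=1 the layout is [[TN, FP], [FN, TP]] (my own formatting sketch):
pd.DataFrame(metrics.confusion_matrix(y_test, y_pred_class),
             index=['true ham', 'true spam'], columns=['pred ham', 'pred spam'])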
Ham messages misclassified as spam (false positives)
# X_test[(y_pred_class==1) & (y_test==0)]
X_test[y_pred_class > y_test]
Spam messages misclassified as ham (false negatives)
X_test[y_pred_class < y_test]
# predicted probability of spam for each test message
y_pred_prob = nb.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
# calculate the ROC AUC
metrics.roc_auc_score(y_test, y_pred_prob)
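The AUC summarizes the whole ROC curve; the curve itself can be drawn with metrics.roc_curve (a sketch, not part of the original notebook):
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label='Multinomial NB')
plt.plot([0, 1], [0, 1], linestyle='--', label='chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()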

Predict with a pipeline and measure accuracy
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
pipe = Pipeline([('bow', CountVectorizer()),
                 ('tfid', TfidfTransformer()),
                 ('model', MultinomialNB())])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred))
# print the confusion matrix
print("=======Confision Matrix===========")
metrics.confusion_matrix(y_test, y_pred)
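Because the pipeline starts from raw text (CountVectorizer is its first step), it can classify new messages directly; a usage sketch with made-up examples:
new_messages = ["WINNER!! You have won a free prize, call now to claim",
                "are we still meeting for lunch today?"]
print(pipe.predict(new_messages))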
Predict with a logistic regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(solver='liblinear')
# time the fit
%time logreg.fit(X_train_dtm, y_train)
y_pred_class = logreg.predict(X_test_dtm)
y_pred_prob = logreg.predict_proba(X_test_dtm)[:, 1]
y_pred_prob
# calculate accuracy of class predictions
print("=======Accuracy Score===========")
print(metrics.accuracy_score(y_test, y_pred_class))
# print the confusion matrix
print("=======Confision Matrix===========")
print(metrics.confusion_matrix(y_test, y_pred_class))
# calculate AUC
print("=======ROC AUC Score===========")
print(metrics.roc_auc_score(y_test, y_pred_prob))
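Because logistic regression is a linear model, its coefficients show which words push a message toward the spam class; a sketch using the CountVectorizer fitted above (the helper variable coefs is mine):
coefs = pd.Series(logreg.coef_[0], index=vect.get_feature_names_out())
# largest positive weights = strongest spam indicators
print(coefs.sort_values(ascending=False).head(10))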