캐글 SMS Spam Collection Dataset 2탄

데이터 받기

# Read the Kaggle CSV (decoded as latin-1) and immediately discard every
# column that contains at least one missing value.
csv_path = "/kaggle/input/sms-spam-collection-dataset/spam.csv"
df = pd.read_csv(csv_path, encoding="latin-1").dropna(axis=1, how="any")

# Assigning two names requires exactly two columns to be left at this point.
df.columns = ['target', 'message']

df.head()

메시지 길이 컬럼 만들기

# Length of each message, counted as the number of pieces obtained by
# splitting on a single space character.
df['message_len'] = [len(message.split(' ')) for message in df['message']]
df.head()

agg를 통한 통계 기법

# Row count per target label, returned as a plain array of counts.
rows_per_target = df.groupby('target')['target'].count()
balance_counts = rows_per_target.values
balance_counts

스팸빈도 시각화

# One bar per class. Counts are read positionally from balance_counts:
# position 0 is drawn as 'ham', position 1 as 'spam'.
fig = go.Figure()
for position, (label, colour) in enumerate(
    [('ham', primary_blue), ('spam', primary_grey)]
):
    class_count = balance_counts[position]
    fig.add_trace(go.Bar(
        x=[label],
        y=[class_count],
        name=label,
        text=[class_count],
        textposition='auto',
        marker_color=colour,
    ))
fig.update_layout(title='Dataset distribution by target')
fig.show()

문자길이에따른 스팸여부 시각화

# How often each message_len value occurs, computed separately for ham and
# spam and sorted by length so each curve is drawn from left to right.
ham_df = df[df['target'] == 'ham']['message_len'].value_counts().sort_index()
spam_df = df[df['target'] == 'spam']['message_len'].value_counts().sort_index()

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=ham_df.index,
    y=ham_df.values,
    name='ham',
    fill='tozeroy',
    marker_color=primary_blue,
))
fig.add_trace(go.Scatter(
    x=spam_df.index,
    y=spam_df.values,
    name='spam',
    fill='tozeroy',
    marker_color=primary_grey,
))
fig.update_layout(
    # Title names what is actually plotted: message-length frequencies per class.
    title='Message length distribution by target'
)
# Only lengths from 0 to 70 are shown on the x axis.
fig.update_xaxes(range=[0, 70])
fig.show()

정규표현식을 통한 특수문자 제거 구현

def clean_text(text):
    """Normalise a message before tokenisation.

    The input is converted with ``str()`` and lower-cased, then the following
    are removed, in this order: text inside square brackets, ``http(s)://``
    and ``www.`` links, ``<...>`` tags, ASCII punctuation, newline characters,
    and every word that contains a digit. Surrounding spaces are not
    collapsed, so removed words can leave consecutive spaces behind.

    Args:
        text: Any object; it is passed through ``str()`` first.

    Returns:
        The cleaned string.
    """
    text = str(text).lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) away from Python's
    # own string-escape processing.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = text.replace('\n', '')
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Keep the cleaned text in its own column, next to the raw message.
df['message_clean'] = df['message'].map(clean_text)
df.head()

불용어 제거

# English stop-word list from stopwords.words(), extended with a few extra
# tokens ('u', 'im', 'c').
stop_words = stopwords.words('english')
more_stopwords = ['u', 'im', 'c']
stop_words = stop_words + more_stopwords

# Frozen copy used only for membership tests, so each word lookup is a hash
# probe instead of a scan over the whole list.
_stop_word_set = frozenset(stop_words)


def remove_stopwords(text):
    """Return *text* without the words listed in ``stop_words``.

    The text is split on single spaces and the surviving words are joined
    back with single spaces; matching is exact and case-sensitive.
    """
    return ' '.join(
        word for word in text.split(' ') if word not in _stop_word_set
    )


df['message_clean'] = df['message_clean'].apply(remove_stopwords)
df.head()

어간 추출(스테밍)

# Snowball stemmer configured for English.
stemmer = nltk.SnowballStemmer("english")


def stemm_text(text):
    """Stem every space-separated word of *text* and rejoin with single spaces."""
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))
    return text


df['message_clean'] = df['message_clean'].apply(stemm_text)
df.head()

위과정 요약 코드

def preprocess_data(text):
    """Run the complete text-preparation pipeline on one message.

    Steps, in order: ``clean_text`` (punctuation, URLs, etc.), removal of
    the words in ``stop_words``, and stemming of every remaining word with
    ``stemmer``. Words are delimited by single spaces throughout.

    Args:
        text: The raw message.

    Returns:
        The cleaned, stop-word-free, stemmed string.
    """
    # Clean punctuation, urls, and so on
    text = clean_text(text)
    # Remove stopwords
    text = ' '.join(word for word in text.split(' ') if word not in stop_words)
    # Stem all the words in the sentence
    text = ' '.join(stemmer.stem(word) for word in text.split(' '))

    return text


# Apply the pipeline to the raw messages. Feeding it 'message_clean' would
# clean and stem text that has already been cleaned and stemmed.
df['message_clean'] = df['message'].apply(preprocess_data)
df.head()

타겟을 0또는 1이라는 숫자로 바꾸기

from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes in a new column.
# NOTE(review): LabelEncoder presumably numbers the sorted labels, i.e.
# ham -> 0 and spam -> 1 — confirm with le.classes_.
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])
df.head()

ham인 경우 시각화

# All cleaned ham messages concatenated into one space-separated string.
ham_text = ' '.join(df.loc[df['target'] == 'ham', 'message_clean'])

# The image is loaded as an array and handed to WordCloud as its mask.
twitter_mask = np.array(Image.open('/kaggle/input/masksforwordclouds/twitter_mask3.jpg'))

wc = WordCloud(background_color='white', max_words=200, mask=twitter_mask)
wc.generate(ham_text)

plt.figure(figsize=(18, 10))
plt.title(
    'Top words for HAM messages',
    fontdict={'size': 22, 'verticalalignment': 'bottom'},
)
plt.imshow(wc)
plt.axis("off")
plt.show()

spam인 경우 시각화

# All cleaned spam messages concatenated into one space-separated string.
spam_text = ' '.join(df.loc[df['target'] == 'spam', 'message_clean'])

# The image is loaded as an array and handed to WordCloud as its mask.
twitter_mask = np.array(Image.open('/kaggle/input/masksforwordclouds/twitter_mask3.jpg'))

wc = WordCloud(background_color='white', max_words=200, mask=twitter_mask)
wc.generate(spam_text)

plt.figure(figsize=(18, 10))
plt.title(
    'Top words for SPAM messages',
    fontdict={'size': 22, 'verticalalignment': 'bottom'},
)
plt.imshow(wc)
plt.axis("off")
plt.show()

테스트 데이터와 학습데이터로 나누기

from sklearn.model_selection import train_test_split

# Features are the preprocessed messages; labels are the integer-encoded targets.
x, y = df['message_clean'], df['target_encoded']
print(len(x), len(y))

# Split with train_test_split's default proportions and a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(len(features), len(labels))

CountVectorizer 적용

from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training messages only; the test
# messages are encoded with that same vocabulary.
vect = CountVectorizer()
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)

tf-idf counter 선언

# max_features=100: keep at most the 100 most frequent terms.
# max_df=0.7: ignore terms that appear in more than 70% of the documents.
# min_df=0.1: ignore terms that appear in fewer than 10% of the documents.
# Together, only terms whose document frequency lies between 10% and 70%
# are kept in the vocabulary.
# ngram_range=(1, 2): both single words and two-word sequences count as terms.
vect_tunned = CountVectorizer(stop_words='english', ngram_range=(1,2), min_df=0.1, max_df=0.7, max_features=100)
vect_tunned

예시 그림

tf-idf vector적용하기

from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the training counts and transform them in one step.
tfidf_transformer = TfidfTransformer()
x_train_tfidf = tfidf_transformer.fit_transform(x_train_dtm)

x_train_tfidf

# Full corpus and labels, used by the tokenizer-based models that follow.
texts, target = df['message_clean'], df['target_encoded']

토크나이저 적용하기

# Build the word -> integer index from the whole corpus.
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(texts)

# word_index is a dict. One extra slot is added to its size.
# NOTE(review): presumably the extra slot is for index 0, which the tokenizer
# does not hand out to any word — confirm against the Tokenizer docs.
word_to_index = word_tokenizer.word_index
vocab_length = len(word_to_index) + 1
vocab_length

print(type(word_to_index))
word_to_index

임베딩하기

def embed(corpus):
    """Turn every text in *corpus* into a sequence of integer ids via ``word_tokenizer``."""
    sequences = word_tokenizer.texts_to_sequences(corpus)
    return sequences


def _token_count(sentence):
    """Number of tokens that ``word_tokenize`` produces for *sentence*."""
    return len(word_tokenize(sentence))


# The message with the most word_tokenize tokens sets the padded length.
longest_train = max(texts, key=_token_count)
length_long_sentence = _token_count(longest_train)

# Pad after the sequence content ('post') up to that length.
train_padded_sentences = pad_sequences(
    embed(texts),
    maxlen=length_long_sentence,
    padding='post',
)

train_padded_sentences

dict안에 값을 비교하여 원하는 값이 있다면 출력하기

embeddings_dictionary = dict()
embedding_dim = 100

# Each line is parsed as a token followed by its float vector components,
# separated by whitespace. The file is decoded explicitly as UTF-8 instead of
# the platform default, and read line by line rather than loaded whole.
with open('/kaggle/input/glove6b100dtxt/glove.6B.100d.txt', encoding='utf-8') as fp:
    for line in fp:
        records = line.split()
        if not records:
            # Skip blank lines instead of failing on records[0].
            continue
        word = records[0]
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Row i holds the vector of the word whose tokenizer index is i; rows for
# words missing from the embedding file stay all-zero.
embedding_matrix = np.zeros((vocab_length, embedding_dim))

for word, index in word_tokenizer.word_index.items():
    embedding_vector = embeddings_dictionary.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

embedding_matrix

혼동 행렬 시각화 구현코드

import plotly.figure_factory as ff

x_axes = ['Ham', 'Spam']
y_axes = ['Spam', 'Ham']


def conf_matrix(z, x=x_axes, y=y_axes):
    """Return an annotated plotly heatmap of the confusion matrix *z*.

    Args:
        z: 2-D array of counts; its rows are reversed before plotting.
        x: Labels for the x axis (titled 'Predicted value').
        y: Labels for the y axis (titled 'Real value').

    Returns:
        The figure created by ``ff.create_annotated_heatmap``.
    """
    # Reverse the row order (flip along axis 0).
    flipped = np.flip(z, 0)

    # Every cell value rendered as a string, used as the cell annotation.
    cell_labels = [[str(value) for value in row] for row in flipped]

    fig = ff.create_annotated_heatmap(
        flipped,
        x=x,
        y=y,
        annotation_text=cell_labels,
        colorscale='Viridis',
    )
    fig.update_layout(
        title_text='Confusion matrix',
        xaxis={'title': 'Predicted value'},
        yaxis={'title': 'Real value'},
    )

    # Turn on the colour bar for the heatmap trace.
    fig['data'][0]['showscale'] = True

    return fig

나이브 베이즈 모델로 학습 진행

from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes fitted on the training document-term matrix.
nb = MultinomialNB()
nb.fit(X=x_train_dtm, y=y_train)

예측 출력

# Hard class predictions, plus the probability found in column 1 of
# predict_proba for each test message.
y_pred_class = nb.predict(x_test_dtm)
class_probabilities = nb.predict_proba(x_test_dtm)
y_pred_prob = class_probabilities[:, 1]

예측 혼동 행렬로 출력

from sklearn import metrics
# Accuracy on the test split, then the confusion matrix as a heatmap.
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print(test_accuracy)

nb_confusion = metrics.confusion_matrix(y_test, y_pred_class)
conf_matrix(nb_confusion)

ROC AUC 점수 출력

# Area under the ROC curve, computed from the predicted probabilities.
metrics.roc_auc_score(y_true=y_test, y_score=y_pred_prob)

나이브 베이즈 파이프라인 정확도 출력

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

# Word counts -> TF-IDF weighting -> multinomial naive Bayes, chained so the
# raw cleaned text can be passed straight to fit/predict.
nb_steps = [
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', MultinomialNB()),
]
pipe = Pipeline(steps=nb_steps)
pipe.fit(x_train, y_train)

y_pred_class = pipe.predict(x_test)
print(metrics.accuracy_score(y_test, y_pred_class))

conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))

XGBoost 파이프라인 정확도 출력

import xgboost as xgb

# Gradient-boosted classifier used as the final pipeline step.
# colsample_bytree, subsample and min_child_weight are not set here, so they
# keep their library defaults.
xgb_classifier = xgb.XGBClassifier(
    n_estimators=80,
    max_depth=7,
    learning_rate=0.1,
    eval_metric='auc',
    use_label_encoder=False,
)

# Same text front end as the naive Bayes pipeline: counts, then TF-IDF.
pipe = Pipeline(steps=[
    ('bow', CountVectorizer()),
    ('tfid', TfidfTransformer()),
    ('model', xgb_classifier),
])
pipe.fit(x_train, y_train)

# Predict on both splits so train and test accuracy can be compared.
y_pred_class = pipe.predict(x_test)
y_pred_train = pipe.predict(x_train)

train_accuracy = metrics.accuracy_score(y_train, y_pred_train)
test_accuracy = metrics.accuracy_score(y_test, y_pred_class)
print('Train: {}'.format(train_accuracy))
print('Test: {}'.format(test_accuracy))

conf_matrix(metrics.confusion_matrix(y_test, y_pred_class))

LSTM 모델 요약

# Split the padded sequences for the neural model. A fixed seed (the same
# value the earlier bag-of-words split uses) makes the split reproducible.
# NOTE: this rebinds y_train / y_test from the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    train_padded_sentences,
    target,
    test_size=0.25,
    random_state=42,
)

def glove_lstm():
    """Build and compile the bidirectional-LSTM binary classifier.

    The embedding layer takes its dimensions and initial weights from
    ``embedding_matrix``; the LSTM and hidden dense layers are each
    ``length_long_sentence`` units wide. The model ends in a single sigmoid
    unit and is compiled with rmsprop and binary cross-entropy.

    Returns:
        The compiled ``Sequential`` model.
    """
    vocab_size, vector_size = embedding_matrix.shape

    layer_stack = [
        Embedding(
            input_dim=vocab_size,
            output_dim=vector_size,
            weights=[embedding_matrix],
            input_length=length_long_sentence,
        ),
        Bidirectional(LSTM(
            length_long_sentence,
            return_sequences=True,
            recurrent_dropout=0.2,
        )),
        GlobalMaxPool1D(),
        BatchNormalization(),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(length_long_sentence, activation="relu"),
        Dropout(0.5),
        Dense(1, activation='sigmoid'),
    ]

    model = Sequential()
    for layer in layer_stack:
        model.add(layer)

    model.compile(
        optimizer='rmsprop',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model


model = glove_lstm()
model.summary()

LSTM 모델 학습

# Start from a freshly built model.
model = glove_lstm()

# Write the model to 'model.h5' whenever val_loss improves.
checkpoint = ModelCheckpoint(
    'model.h5',
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
)
# Multiply the learning rate by 0.2 after 5 epochs without val_loss improvement.
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    verbose=1,
    patience=5,
    # The floor has to sit below the optimizer's starting rate; a floor of
    # 0.001 equals the rmsprop default and would keep the callback from ever
    # lowering the rate.
    min_lr=1e-5,
)
history = model.fit(
    X_train,
    y_train,
    epochs=7,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=1,
    callbacks=[reduce_lr, checkpoint],
)

시각화 함수 구현

def plot_learning_curves(history, arr):
    """Plot pairs of training-history curves side by side.

    Args:
        history: Object whose ``history`` attribute maps a metric name to
            its sequence of recorded values (as returned by ``model.fit``).
        arr: Sequence of ``[name_a, name_b]`` pairs. One subplot is drawn
            per pair, with both named curves on it.
    """
    # squeeze=False keeps the axes 2-D even when there is a single pair.
    fig, axes = plt.subplots(1, len(arr), figsize=(20, 5), squeeze=False)
    for axis, pair in zip(axes[0], arr):
        first, second = pair[0], pair[1]
        axis.plot(history.history[first])
        axis.plot(history.history[second])
        axis.legend([first, second], fontsize=18)
        # The x position is the index into the recorded values; the y axis
        # is labelled with the first metric of the pair.
        axis.set_xlabel('Epoch', fontsize=16)
        axis.set_ylabel(first, fontsize=16)
        axis.set_title(first + ' X ' + second, fontsize=16)


plot_learning_curves(history, [['loss', 'val_loss'],['accuracy', 'val_accuracy']])

혼동 행렬 출력

# Threshold the model outputs at 0.5 to obtain 0/1 predictions, then plot
# the resulting confusion matrix.
predicted_scores = model.predict(X_test)
y_preds = (predicted_scores > 0.5).astype("int32")
lstm_confusion = metrics.confusion_matrix(y_test, y_preds)
conf_matrix(lstm_confusion)

bert 사용하기

!pip install transformers

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

import transformers
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer

# Use a TPU when one can be reached; otherwise fall back to the default
# distribution strategy.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as tpu_error:
    # Still best-effort, but KeyboardInterrupt / SystemExit are no longer
    # swallowed, and the reason for the fallback is reported.
    print('TPU not available, using the default strategy:', tpu_error)
    strategy = tf.distribute.get_strategy()

print('Number of replicas in sync: ', strategy.num_replicas_in_sync)

from transformers import BertTokenizer
# Load the tokenizer from the same checkpoint name as the model that is
# fine-tuned in this file ('bert-base-uncased'), so the two always match.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def bert_encode(data, maximum_length) :
    """Encode texts into fixed-length BERT inputs using ``tokenizer``.

    Every text is encoded with special tokens, padded up to
    *maximum_length* and truncated down to it, so all rows have the same
    length.

    Args:
        data: Iterable of texts.
        maximum_length: Length of every encoded sequence.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of numpy arrays, one row
        per text.
    """
    input_ids = []
    attention_masks = []

    for text in data:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=maximum_length,
            # Explicit padding/truncation arguments replace the deprecated
            # pad_to_max_length flag and the implicit truncation fallback.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.array(input_ids), np.array(attention_masks)

# Corpus and labels for the BERT model.
texts, target = df['message_clean'], df['target_encoded']

# Encode every message to a fixed length of 60.
train_input_ids, train_attention_masks = bert_encode(
    data=texts,
    maximum_length=60,
)

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

def create_model(bert_model, max_len=60):
    """Wrap *bert_model* in a binary classifier and compile it.

    Args:
        bert_model: Callable model that accepts ``[input_ids,
            attention_masks]`` and returns an indexable output.
        max_len: Length of the input sequences; defaults to 60.

    Returns:
        A compiled Keras model taking ``[input_ids, attention_masks]`` and
        producing one sigmoid output per example.
    """
    input_ids = tf.keras.Input(shape=(max_len,), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len,), dtype='int32')

    output = bert_model([input_ids, attention_masks])
    # Second element of the BERT output.
    # NOTE(review): presumably the pooled sentence representation — confirm.
    output = output[1]
    output = tf.keras.layers.Dense(32, activation='relu')(output)
    output = tf.keras.layers.Dropout(0.2)(output)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(output)

    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    # `learning_rate` is the supported keyword; `lr` is a deprecated alias.
    model.compile(Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    return model

from transformers import TFBertModel
# Pretrained BERT encoder that the classifier head is built on.
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

model = create_model(bert_model)
model.summary()

# Fine-tune on the whole encoded corpus, with 20% used for validation.
history = model.fit(
    x=[train_input_ids, train_attention_masks],
    y=target,
    batch_size=10,
    epochs=3,
    validation_split=0.2,
)

curve_pairs = [['loss', 'val_loss'], ['accuracy', 'val_accuracy']]
plot_learning_curves(history, curve_pairs)

'캐글 코드' 카테고리의 다른 글

캐글 우주 타이타닉 예측 1탄 (0)	2024.02.23
캐글 자전거 수요예측 (0)	2024.02.22
캐글 SMS Spam Collection Dataset 1탄 (0)	2024.01.31
캐글 News Detection 1탄 (1)	2024.01.30
캐글 NLP 뉴스데이터 분석 1탄 (0)	2024.01.29

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

백준파이썬개발자:프로젝트골드

캐글 SMS Spam Collection Dataset 2탄

데이터 받기

메시지 길어 컬럼 만들기

agg를 통한 통계 기법

스팸빈도 시각화

정규표현식을 통한 특수문자 제거 구현

불용어 제거

형태소 분석

위과정 요약 코드

타겟을 0또는 1이라는 숫자로 바꾸기

ham인 경우 시각화

spam인 경우 시각화

테스트 데이터와 학습데이터로 나누기

tf-idfcounter 적용

tf-idf counter 선언

tf-idf vector적용하기

토크나이저 적용하기

임베딩하기

dict안에 값을 비교하여 원하는 값이 있다면 출력하기

혼돈행렬 시각화 구현코드

나이즈 베이 모델로 학습진행

예측 출력

예측 혼돈행렬로 출력

정확도 출력

나이브 베이 파이프라인 정확도 출력

X-bost 파이프라인 정확도 출력

lsmt 모델 학습

시각화 함수 구현

혼돈 행렬 출력

bert 사용하기

'캐글 코드' 카테고리의 다른 글

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역

캐글 SMS Spam Collection Dataset 2탄

데이터 받기

메시지 길어 컬럼 만들기

agg를 통한 통계 기법

스팸빈도 시각화

정규표현식을 통한 특수문자 제거 구현

불용어 제거

형태소 분석

위과정 요약 코드

타겟을 0또는 1이라는 숫자로 바꾸기

ham인 경우 시각화

spam인 경우 시각화

테스트 데이터와 학습데이터로 나누기

tf-idfcounter 적용

tf-idf counter 선언

tf-idf vector적용하기

토크나이저 적용하기

임베딩하기

dict안에 값을 비교하여 원하는 값이 있다면 출력하기

혼돈행렬 시각화 구현코드

나이즈 베이 모델로 학습진행

예측 출력

예측 혼돈행렬로 출력

정확도 출력

나이브 베이 파이프라인 정확도 출력

X-bost 파이프라인 정확도 출력

lsmt 모델 학습

시각화 함수 구현

혼돈 행렬 출력

bert 사용하기

'캐글 코드' 카테고리의 다른 글

'캐글 코드' Related Articles

티스토리툴바

단축키

내 블로그

블로그 게시글

모든 영역