hyeori

0215 분석기록 : 불법주정차 신고현황(22.11월~23.10월).csv 본문

Project

0215 분석기록 : 불법주정차 신고현황(22.11월~23.10월).csv

혜오리이 2024. 2. 15. 14:13

1) 서울 불법 주정차 신고 현황 시각화

2) k= 24 일 때

3) K-means clustering 후, cluster_id = 19인 클러스터 시각화

import matplotlib.pyplot as plt

# Select the rows whose assigned cluster id is 19.
in_cluster_19 = seoul_data['cluster_id'] == 19
cluster_19 = seoul_data[in_cluster_19]

# Centroid stored by the fitted model for cluster 19.
# NOTE(review): components 0 and 1 are drawn below as longitude and latitude;
# this assumes the model was fitted on (경도, 위도) in that order - confirm.
cluster_19_center = kmeans.cluster_centers_[19]

# Member points in blue, the centroid on top as a red X.
plt.scatter(
    x=cluster_19['경도'],
    y=cluster_19['위도'],
    s=10,
    c='blue',
    alpha=0.5,
)
plt.scatter(
    x=cluster_19_center[0],
    y=cluster_19_center[1],
    s=20,
    c='red',
    marker='X',
    alpha=0.75,
)

# Title and axis labels.
plt.title('K-Means 클러스터링 결과 시각화 (cluster_ID 19)')
plt.xlabel('경도')
plt.ylabel('위도')

# Render the figure.
plt.show()

4) 마포구 데이터 추출

# Keep only the rows whose coordinates fall inside the latitude/longitude
# box used for Seoul (both bounds inclusive).
in_seoul_lat = data['위도'].between(37.4, 37.7)
in_seoul_lon = data['경도'].between(126.7, 127.2)
seoul = data[in_seoul_lat & in_seoul_lon]

# Name fragments searched for in the address column; a row is kept when its
# address contains at least one of them (missing addresses never match).
mapo = ['마포', '공덕', '대흥', '도화', '망원', '상암동', '서강', '서교', '성산', '신수', '아현', '연남', '염리', '용강', '합정']
address_pattern = '|'.join(mapo)
address_matches = seoul['주소'].str.contains(address_pattern, na=False)
mapo_df = seoul[address_matches].reset_index(drop=True)

# Tighten the result to a smaller coordinate box (both bounds inclusive).
in_mapo_lat = mapo_df['위도'].between(37.53, 37.59)
in_mapo_lon = mapo_df['경도'].between(126.87, 126.97)
mapo_df = mapo_df[in_mapo_lat & in_mapo_lon]

import matplotlib.pyplot as plt
# Scatter every remaining report: longitude on x, latitude on y.
plt.scatter(
    x=mapo_df['경도'],
    y=mapo_df['위도'],
    s=15,
    c='blue',
    alpha=1,
    edgecolors='skyblue',
    linewidths=0.5,
)
plt.xlabel('경도')
plt.ylabel('위도')
plt.title('마포')

plt.show()

5) LSTM 시도 - 실패 코드

import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Split by row position without shuffling: the first 80% of the rows become
# the training frame and the remaining rows the test frame.
train_size = int(len(mapo_df) * 0.8)
train = mapo_df.iloc[:train_size]
test = mapo_df.iloc[train_size:]

# Sliding-window dataset builder.
def create_dataset(dataset, time_steps=1, column=1):
    """Turn one column of a DataFrame into sliding-window samples.

    Each sample is ``time_steps`` consecutive values of the chosen column and
    its target is the value that immediately follows the window.

    Args:
        dataset: DataFrame to read from; rows are used in their current order.
        time_steps: Number of consecutive values per input window (positive).
        column: Positional index of the column to window. Defaults to 1,
            the column the original implementation always read.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape ``(n, time_steps)`` and ``y``
        has shape ``(n,)`` with ``n = len(dataset) - time_steps``. When the
        frame is too short to hold a single window, ``n`` is 0 and the arrays
        are empty but keep those shapes.
    """
    series = dataset.iloc[:, column].to_numpy()

    # Every start index i with i + time_steps <= len(series) - 1 is a valid
    # window; the previous bound (len - time_steps - 1) dropped the last one.
    n_samples = len(series) - time_steps
    if n_samples <= 0:
        return (
            np.empty((0, time_steps), dtype=series.dtype),
            np.empty((0,), dtype=series.dtype),
        )

    data_x = np.array([series[i:i + time_steps] for i in range(n_samples)])
    # Copy so the targets do not alias the DataFrame's underlying buffer.
    data_y = series[time_steps:].copy()
    return data_x, data_y

# Imported here because this section uses them and the import lines of this
# snippet do not provide them.
import os

from sklearn.metrics import mean_squared_error

# Number of past values fed to the model per sample.
# NOTE(review): `time_steps` was used without being defined anywhere in the
# visible code; 10 is a placeholder - set it to the intended window length.
time_steps = 10

# Build the sliding-window datasets.
# NOTE(review): create_dataset reads column 1 of the frame; that column must
# be numeric for training to work - confirm which column this is.
X_train, y_train = create_dataset(train, time_steps)
X_test, y_test = create_dataset(test, time_steps)

# Keras LSTM layers take 3-D input (samples, time steps, features); the
# windows come back 2-D, so add a trailing feature axis of size 1.
X_train = X_train.reshape(-1, time_steps, 1)
X_test = X_test.reshape(-1, time_steps, 1)

# Two stacked LSTM layers followed by a single-unit regression head.
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(time_steps, 1)))
model.add(LSTM(units=50))
model.add(Dense(units=1))

# Compile once with an explicit learning rate (the earlier default-'adam'
# compile was immediately overridden and has been removed).
learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)
model.compile(optimizer=optimizer, loss='mse')

model.summary()

# Single-file weights checkpoint. The `.weights.h5` suffix is what current
# Keras requires for save_weights; older tf.keras treats it as HDF5.
weights_path = '/content/drive/MyDrive/Colab Notebooks/2024 DF 겨울시즌.weights.h5'

# `history` stays None when weights are loaded, so the loss curves are only
# drawn after an actual training run.
history = None
if os.path.exists(weights_path):
    # A failure here (e.g. architecture mismatch) is surfaced instead of
    # being silently swallowed.
    model.load_weights(weights_path)
    print("Loaded model weights from disk")
else:
    print("No weights found, training model from scratch")

    history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.1, verbose=2)

    # Save model weights only after a fresh training run.
    model.save_weights(weights_path)

if history is not None:
    plt.plot(history.history['loss'], label='Training loss')
    plt.plot(history.history['val_loss'], label='Validation loss')
    plt.legend()
    plt.show()

# Predict on both splits with the trained (or loaded) model.
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)


# RMSE on each split; predictions have shape (n, 1), so take column 0.
train_score = np.sqrt(mean_squared_error(y_train, train_predict[:, 0]))
print('훈련 RMSE: %.2f' % (train_score))
test_score = np.sqrt(mean_squared_error(y_test, test_predict[:, 0]))
print('테스트 RMSE: %.2f' % (test_score))

 

6) 특이점에 대해 살펴보자

1. 요일별

# @title 요일

from matplotlib import pyplot as plt
import seaborn as sns
# Number of rows per weekday value, drawn as a horizontal bar chart.
weekday_counts = mapo_df.groupby('요일').size()
bar_colors = sns.palettes.mpl_palette('Dark2')
weekday_counts.plot(kind='barh', color=bar_colors)

# Hide the top and right frame lines of the current axes.
current_axes = plt.gca()
current_axes.spines[['top', 'right']].set_visible(False)

2. timestamp vs 건수

# @title timestamp vs 건수

from matplotlib import pyplot as plt
import seaborn as sns
def _plot_series(series, series_name, series_index=0):
  """Draw one labelled line of '건수' against 'timestamp' on the current axes.

  Args:
    series: Table-like object indexable by the 'timestamp' and '건수' keys.
    series_name: Label attached to the line (used by the legend).
    series_index: Position of this series; selects a colour from the 'Dark2'
      palette, wrapping around when the index exceeds the palette size.
  """
  from matplotlib import pyplot as plt
  import seaborn as sns
  colors = list(sns.palettes.mpl_palette('Dark2'))
  line_color = colors[series_index % len(colors)]

  plt.plot(
      series['timestamp'],
      series['건수'],
      label=series_name,
      color=line_color,
  )

# One line per weekday group, drawn in timestamp order on a single axes.
fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
df_sorted = mapo_df.sort_values('timestamp', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('요일')):
  _plot_series(series, series_name, i)
# Build the legend once, after every line exists. Calling fig.legend inside
# the loop added a separate, overlapping legend for each group.
if ax.lines:
  fig.legend(title='요일', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('timestamp')
_ = plt.ylabel('건수')