hyeori
0215 분석기록 : 불법주정차 신고현황(22.11월~23.10월).csv 본문
1) 서울 불법 주정차 신고 현황 시각화
2) k= 24 일 때
3) K-means clustering 후, cluster_id = 19인 클러스터 시각화
import matplotlib.pyplot as plt
# Visualize one K-means cluster: the member points of cluster 19 plus its centroid.
# Select the rows whose 'cluster_id' equals 19.
cluster_19 = seoul_data[seoul_data['cluster_id'] == 19]
# Row 19 of `kmeans.cluster_centers_`, i.e. the centroid of that cluster.
# NOTE(review): elements 0 and 1 are plotted below as longitude and latitude; this
# assumes the model was fitted on features ordered (경도, 위도) — TODO confirm.
cluster_19_center = kmeans.cluster_centers_[19]
# Draw the member points (blue) and overlay the centroid (red X marker).
plt.scatter(cluster_19['경도'], cluster_19['위도'], c='blue', s=10, alpha=0.5)
plt.scatter(cluster_19_center[0], cluster_19_center[1], c='red', s=20, alpha=0.75, marker='X')
# Figure title and axis labels.
plt.title('K-Means 클러스터링 결과 시각화 (cluster_ID 19)')
plt.xlabel('경도')
plt.ylabel('위도')
# Render the figure.
plt.show()
4) 마포구 데이터 추출
# Step 1: keep only rows inside a rough Seoul bounding box
# (latitude 37.4-37.7, longitude 126.7-127.2, both bounds inclusive).
seoul = data[data['위도'].between(37.4, 37.7)]
seoul = seoul[seoul['경도'].between(126.7, 127.2)]

# Step 2: address keywords used to pick out the Mapo-gu rows.
mapo = [
    '마포', '공덕', '대흥', '도화', '망원', '상암동', '서강', '서교',
    '성산', '신수', '아현', '연남', '염리', '용강', '합정',
]

# Rows whose address contains any keyword; rows with a missing address never match.
mapo_df = seoul.loc[seoul['주소'].str.contains('|'.join(mapo), na=False)]
mapo_df = mapo_df.reset_index(drop=True)

# Step 3: tighten to a narrower bounding box
# (latitude 37.53-37.59, longitude 126.87-126.97, both bounds inclusive).
mapo_df = mapo_df[
    mapo_df['위도'].between(37.53, 37.59) & mapo_df['경도'].between(126.87, 126.97)
]
import matplotlib.pyplot as plt
# Scatter plot of the filtered mapo_df rows: longitude on x, latitude on y.
plt.scatter(mapo_df['경도'],mapo_df['위도'], c='blue', s=15, alpha=1,edgecolors='skyblue',linewidths=0.5)
plt.title('마포')
plt.xlabel('경도')
plt.ylabel('위도')
# Render the figure.
plt.show()
5) LSTM 시도 - 실패코드
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
# Positional 80/20 split (no shuffling): the leading 80% of rows form the
# training set and the remaining rows form the test set.
train_size = int(0.8 * len(mapo_df))
train = mapo_df.iloc[:train_size]
test = mapo_df.iloc[train_size:]
# 데이터셋 생성 함수 정의
def create_dataset(dataset, time_steps=1, column=1):
    """Build sliding-window samples for one-step-ahead prediction.

    Each sample is ``time_steps`` consecutive values of a single column, and
    its target is the value in the row immediately after the window.

    Args:
        dataset: pandas DataFrame; rows are used in their existing order.
        time_steps: Window length, i.e. number of consecutive rows per sample.
        column: Positional index of the column to window. Defaults to 1,
            the column this function has always used.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with
        ``n_samples = len(dataset) - time_steps`` entries each. ``X`` has
        shape ``(n_samples, time_steps)`` and ``y`` has shape
        ``(n_samples,)``. Both are empty when the dataset has no more than
        ``time_steps`` rows.
    """
    values = dataset.iloc[:, column].to_numpy()
    # Every window whose target row still exists is a valid sample. An extra
    # "- 1" in this bound would silently drop the last valid window.
    n_samples = max(len(values) - time_steps, 0)
    data_x = [values[start:start + time_steps] for start in range(n_samples)]
    data_y = values[time_steps:time_steps + n_samples]
    return np.array(data_x), np.array(data_y)
# Build the windowed (X, y) arrays for the training and the test split.
# NOTE(review): `time_steps` is not assigned anywhere in the visible code —
# confirm it is defined in an earlier cell, otherwise this raises NameError.
X_train, y_train = create_dataset(train, time_steps)
X_test, y_test = create_dataset(test, time_steps)
# Build the LSTM model: two stacked 50-unit LSTM layers and a 1-unit Dense output.
model = Sequential()
# return_sequences=True so the second LSTM layer receives the whole sequence.
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
model.add(LSTM(units=50))
model.add(Dense(units=1))
model.summary()

# Adam optimizer with an explicit learning rate.
learning_rate = 0.01
optimizer = Adam(learning_rate=learning_rate)

# Compile exactly once. Compiling with the default 'adam' first and then again
# with the custom optimizer would only discard the first configuration.
model.compile(optimizer=optimizer, loss='mse')
# Warm-start from previously saved weights when they can be loaded, then train,
# save the weights, and plot the loss curves.
# NOTE(review): this assumes training runs whether or not weights were loaded
# (the loss plot below needs `history` in both cases) — confirm the intent.
weights_path = '/content/drive/MyDrive/Colab Notebooks/2024 DF 겨울시즌'
try:
    model.load_weights(weights_path)
except Exception as exc:
    # `except Exception` instead of a bare `except` so KeyboardInterrupt and
    # SystemExit still propagate; the reason is printed rather than swallowed.
    print("No weights found, training model from scratch")
    print(f"(load_weights failed with {type(exc).__name__}: {exc})")
else:
    print("Loaded model weights from disk")

history = model.fit(X_train, y_train, epochs=20, batch_size=16, validation_split=0.1, verbose=2)

# Save model weights after training.
model.save_weights(weights_path)

# Training vs. validation loss per epoch.
plt.plot(history.history['loss'], label='Training loss')
plt.plot(history.history['val_loss'], label='Validation loss')
plt.legend()
plt.show()
# Generate predictions for both splits with the trained model.
train_predict = model.predict(X_train)
test_predict = model.predict(X_test)

# Root-mean-squared error per split, computed directly with NumPy.
# `mean_squared_error` is not imported in the visible code, so calling it would
# raise NameError; sqrt(mean(squared error)) is the same quantity.
# Column 0 of the predictions is the single model output.
train_score = np.sqrt(np.mean((np.asarray(y_train) - train_predict[:, 0]) ** 2))
print('훈련 RMSE: %.2f' % (train_score))
test_score = np.sqrt(np.mean((np.asarray(y_test) - test_predict[:, 0]) ** 2))
print('테스트 RMSE: %.2f' % (test_score))
6) 특이점에 대해 살펴보자
1. 요일별
# @title 요일
from matplotlib import pyplot as plt
import seaborn as sns
# Horizontal bar chart of the number of mapo_df rows per '요일' (weekday) group,
# coloured with the 'Dark2' palette.
mapo_df.groupby('요일').size().plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right spines of the current axes.
plt.gca().spines[['top', 'right',]].set_visible(False)
2. timestamp vs 건수
# @title timestamp vs 건수
from matplotlib import pyplot as plt
import seaborn as sns
def _plot_series(series, series_name, series_index=0):
    """Plot one line of '건수' against 'timestamp' via pyplot.

    The line is labelled ``series_name`` and takes the 'Dark2' palette colour
    at ``series_index``, wrapping around when the index exceeds the palette
    length.
    """
    import seaborn as sns
    from matplotlib import pyplot as plt

    dark2 = list(sns.palettes.mpl_palette('Dark2'))
    line_color = dark2[series_index % len(dark2)]
    plt.plot(series['timestamp'], series['건수'], label=series_name, color=line_color)
# One line per '요일' (weekday) group showing '건수' over 'timestamp' on a shared figure.
fig, ax = plt.subplots(figsize=(10, 5.2), layout='constrained')
# Sort by timestamp so every line is drawn in ascending timestamp order.
df_sorted = mapo_df.sort_values('timestamp', ascending=True)
for i, (series_name, series) in enumerate(df_sorted.groupby('요일')):
    _plot_series(series, series_name, i)
# Create the legend once, after every series has been plotted, so a single
# legend lists all groups instead of one legend being added per iteration.
fig.legend(title='요일', bbox_to_anchor=(1, 1), loc='upper left')
sns.despine(fig=fig, ax=ax)
plt.xlabel('timestamp')
_ = plt.ylabel('건수')