파이썬과 streamlit을 이용한 데이터 분석

Notice

Recent Posts

Recent Comments

Link

« 2025/08 »
일	월	화	수	목	금	토
					1	2
3	4	5	6	7	8	9
10	11	12	13	14	15	16
17	18	19	20	21	22	23
24	25	26	27	28	29	30
31

Tags more

Archives

Today

Total

관리 메뉴

코딩하는 덕구 🐶

파이썬과 streamlit을 이용한 데이터 분석 본문

데이터

파이썬과 streamlit을 이용한 데이터 분석

코딩하는 덕구 🐶 2024. 11. 27. 09:41

728x90

data_processing.py

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import io

# 이상치를 제거하는 함수

def remove_outliers(df, factor=1.5):
    """Drop every row that holds an IQR outlier in any numeric column.

    Columns are processed in order, and each column's quartiles are computed
    on the rows that survived the filters of the previous columns, so the
    result depends on column order.  A row whose value is NaN in a numeric
    column is dropped as well, because NaN never falls inside the fences.

    Args:
        df: Input DataFrame.  It is not modified; a filtered frame is
            returned instead.
        factor: Multiplier applied to the IQR when building the fences
            ``[Q1 - factor * IQR, Q3 + factor * IQR]``.  Defaults to 1.5.

    Returns:
        A DataFrame containing only the rows that lie inside the fences of
        every numeric column.
    """
    for column in df.columns:
        # dtype kinds "i", "u", "f" are signed int, unsigned int and float of
        # any width; booleans, datetimes and objects are left untouched.
        if df[column].dtype.kind not in "iuf":
            continue

        q1 = df[column].quantile(0.25)
        q3 = df[column].quantile(0.75)
        iqr = q3 - q1

        # Fences of the accepted value range for this column.
        lower_bound = q1 - factor * iqr
        upper_bound = q3 + factor * iqr

        # Keep only rows inside the fences (both ends inclusive).
        df = df[df[column].between(lower_bound, upper_bound)]

    return df

def convert_temperature(df):
    """Return the timestamp and the two ambient-condition columns of ``df``.

    Side effect: the ``time_stamp`` column of ``df`` itself is overwritten
    with its ``pd.to_datetime`` conversion before the columns are selected.
    """
    wanted_columns = [
        'time_stamp',
        'AmbientConditions.AmbientHumidity.U.Actual',
        'AmbientConditions.AmbientTemperature.U.Actual',
    ]
    parsed_stamps = pd.to_datetime(df['time_stamp'])
    df['time_stamp'] = parsed_stamps
    return df[wanted_columns]

# Load the raw process data once at import time.  main.py reads the
# module-level names `df`, `s` and `df_corr` defined below.
df = pd.read_csv('./data/continuous_factory_process.csv')

# DataFrame.info() writes to a stream instead of returning text, so capture
# its output in a buffer and keep it as the string `s`.
buffer = io.StringIO()
df.info(buf=buffer)
s = buffer.getvalue()

# Standard deviation of every numeric column.
df_std = df.std(numeric_only=True)

# Columns whose standard deviation is at most 0.001 are collected here and
# dropped to form `df2`.
std0_columns = [name for name, std in df_std.items() if std <= 0.001]
df2 = df.drop(std0_columns, axis=1)

# list.append() returns None, so the extended list is built explicitly;
# `std0_columns` itself is left unchanged.
std0_columns2 = std0_columns + ["time_stamp"]

# Correlation input: the remaining columns without the timestamp.
df_corr = df2.drop("time_stamp", axis=1)

main.py

import streamlit as st

import pandas as pd

import matplotlib.pyplot as plt

import seaborn as sns

import io

import texts

import data_processing as dp

# Page setup and title.
st.set_page_config(layout="wide")
st.write(texts.title_text)
st.divider()

# Chapter 1: choose how to inspect the raw data (left) and show it (right).
st.write(texts.ch1_text)
col1, col2 = st.columns([0.3, 0.7], gap="small")

with col1:
    data_check_methods = st.radio(
        "데이터 확인 방법",
        ["info", "Random", "Describe", "Full"],
        captions=[
            "데이터 개요를 확인할 수 있습니다.",
            "무작위 데이터 10개를 확인할 수 있습니다.",
            "데이터의 기술 통계량을 확인할 수 있습니다.",
            "데이터 전체를 확인할 수 있습니다.",
        ],
    )

with col2:
    # Table views are wrapped in callables so only the selected one is built.
    table_views = {
        "Full": lambda: dp.df,
        "Random": lambda: dp.df.sample(10),
        "Describe": lambda: dp.df.describe(),
    }
    build_view = table_views.get(data_check_methods)
    if build_view is None:
        # Any other choice (i.e. "info") shows the captured df.info() text.
        st.text(dp.s)
    else:
        st.dataframe(build_view())

st.write(texts.ch1_text2)
st.divider()

# Section 2.1: line charts over time, drawn from the outlier-filtered data.
st.write(texts.ch2_text)

temp_df = dp.remove_outliers(dp.df)


def _show_time_series(frame, column, color, figsize):
    """Draw ``frame[column]`` against ``frame['time_stamp']`` and render it."""
    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(frame['time_stamp'], frame[column], color=color)
    ax.set_title(column)
    # Put a tick on every 17th sample only and tilt the labels.
    ax.set_xticks(frame['time_stamp'][::17])
    ax.tick_params(axis='x', labelrotation=45)
    st.pyplot(fig)
    # pyplot keeps a reference to every figure it creates; close it so
    # figures do not pile up across Streamlit reruns.
    plt.close(fig)


time_col1, time_col2 = st.columns(2)

with time_col1:
    _show_time_series(
        temp_df, 'AmbientConditions.AmbientHumidity.U.Actual', "blue", (10, 3)
    )
    _show_time_series(
        temp_df, 'AmbientConditions.AmbientTemperature.U.Actual', "red", (10, 3)
    )

with time_col2:
    # The timestamp and the two ambient columns plotted on the left are
    # excluded from the selectable columns.
    to_remove = [
        "time_stamp",
        "AmbientConditions.AmbientHumidity.U.Actual",
        "AmbientConditions.AmbientTemperature.U.Actual",
    ]
    filtered_list = [x for x in dp.df.columns if x not in to_remove]
    select_data = st.selectbox(
        "선 그래프로 보고싶은 데이터를 선택하세요",
        filtered_list,
    )
    _show_time_series(temp_df, select_data, "green", (10, 7))

# Section 2.2: correlation of the columns in dp.df_corr, as table or heatmap.
st.write(texts.ch2_text2)

col3, col4 = st.columns([0.3, 0.7], gap="small")

with col3:
    corr_check_methods = st.radio(
        "상관관계 확인 방법",
        ["Table", "Heatmap"],
        captions=["표 형식으로 확인합니다.", "히트맵 그래프로 확인합니다."],
    )

    # Short explanation matching the selected view.
    if corr_check_methods == "Table":
        st.text_area(
            "상관관계란?",
            "두 변수 간의 관계의 강도와 방향을 수량화하는 통계적 방법입니다. 상관관계를 통해 한 변수의 변화가 다른 변수의 변화와 어떤 관계를 가지는지를 알 수 있으며, 이는 변수들 사이의 선형적 관계를 나타냅니다. 상관관계는 -1에서 +1 사이의 값을 가지며, 이를 상관 계수(Correlation Coefficient)라고 합니다.",
            height=200,
        )
    else:
        st.text_area(
            "히트맵이란?",
            "데이터의 매트릭스를 색상의 변화를 통해 시각적으로 표현한 그래프입니다. 이는 데이터의 패턴, 변화, 밀도 등을 한눈에 파악하기 좋게 해주며, 특히 대규모 데이터 세트 내에서 복잡한 관계와 구조를 이해하는 데 유용합니다. 히트맵은 다양한 분야에서 사용되며, 특히 데이터 과학, 통계 분석, 기계 학습, 바이오인포매틱스 등에서 널리 적용됩니다.",
            height=200,
        )

with col4:
    # Both views show the same matrix, so compute it in one place.
    corr_matrix = dp.df_corr.corr()

    if corr_check_methods == "Table":
        st.dataframe(corr_matrix)
    else:
        fig, ax = plt.subplots()
        cax = ax.pcolor(corr_matrix)
        # Add the colour scale next to the heatmap.
        fig.colorbar(cax)
        st.pyplot(fig)
        # Release the figure; pyplot would otherwise keep it alive across
        # Streamlit reruns.
        plt.close(fig)

# Section 2.3: histogram and box plot of one Stage2 output measurement.
st.write(texts.ch2_text3)

col5, col6 = st.columns(2)

with col5:
    variable = st.selectbox(
        "변수 선택",
        # Measurement1 ... Measurement14
        tuple(f"Stage2.Output.Measurement{i}.U.Actual" for i in range(1, 15)),
    )
    # Named n_bins rather than `bin` so the builtin is not shadowed.
    n_bins = st.slider('histogram bins', 5, 40, 20, step=1)
    st.dataframe(dp.df[variable].describe(), width=700)

with col6:
    agree = st.toggle("이상치 제거한 데이터 확인하기")

    # Both charts read from the same frame; the toggle only decides whether
    # it is the raw data or the outlier-filtered data.
    plot_df = dp.remove_outliers(dp.df) if agree else dp.df

    fig, ax = plt.subplots(figsize=(9, 4))
    ax.set_xlabel(variable)
    ax.set_ylabel('counts')
    ax.hist(plot_df[variable], bins=n_bins)
    st.pyplot(fig)
    # Close each figure after rendering so figures do not accumulate across
    # Streamlit reruns.
    plt.close(fig)

    fig, ax = plt.subplots(figsize=(6, 2))
    ax.set_ylabel(variable)
    ax.boxplot(plot_df[variable], vert=False)
    st.pyplot(fig)
    plt.close(fig)

# Per-machine histogram grids: raw-material properties (left) and equipment
# variables (right).
st.write(texts.ch2_text4)

# Histogram colour used for each machine in both panels.
MACHINE_COLORS = {'Machine1': "blue", 'Machine2': "orange", 'Machine3': "green"}

# Column-name suffixes; the full column name is "<machine>.<suffix>".
RAW_MATERIAL_SUFFIXES = (
    'RawMaterial.Property1',
    'RawMaterial.Property2',
    'RawMaterial.Property3',
    'RawMaterial.Property4',
)
EQUIPMENT_SUFFIXES = (
    'Zone1Temperature.C.Actual',
    'Zone2Temperature.C.Actual',
    'MotorAmperage.U.Actual',
    'MotorRPM.C.Actual',
    'MaterialPressure.U.Actual',
    'MaterialTemperature.U.Actual',
    'ExitZoneTemperature.C.Actual',
)


def _show_histogram_grid(machine, suffixes, nrows, **title_kwargs):
    """Render a ``nrows`` x 2 grid of 30-bin histograms for one machine.

    ``title_kwargs`` are forwarded to ``set_title`` for every subplot.
    """
    columns = [f"{machine}.{suffix}" for suffix in suffixes]
    fig, axes = plt.subplots(nrows, 2)
    flat_axes = axes.ravel()

    for axis, column in zip(flat_axes, columns):
        dp.df.hist(column=column, bins=30, ax=axis, color=MACHINE_COLORS[machine])
        axis.set_title(column, **title_kwargs)

    # Hide grid cells that received no histogram (7 variables in a 4x2 grid).
    for axis in flat_axes[len(columns):]:
        axis.set_visible(False)

    fig.tight_layout()
    st.pyplot(fig)
    # Close the figure so figures do not accumulate across Streamlit reruns.
    plt.close(fig)


col7, col8 = st.columns(2)

with col7:
    option = st.selectbox(
        'Machine 1, 2, 3 원재료 변수 시각화',
        ('Machine1', 'Machine2', 'Machine3'),
    )
    _show_histogram_grid(option, RAW_MATERIAL_SUFFIXES, 2)

with col8:
    option2 = st.selectbox(
        'Machine 1, 2, 3 설비 변수 시각화',
        ('Machine1', 'Machine2', 'Machine3'),
    )
    _show_histogram_grid(option2, EQUIPMENT_SUFFIXES, 4, fontsize=10)

# Chapter 3: offer the full dataset as a CSV download.
st.divider()
st.write(texts.ch3_text)

st.download_button(
    label="Download data as CSV",
    # DataFrame.to_csv() with no path returns the CSV text (index included).
    data=dp.df.to_csv(),
    file_name='data.csv',
    # 'csv' alone is not a MIME type; 'text/csv' is the registered one.
    mime='text/csv',
)

texts.py

# Page header in Markdown: project title, a short introduction and the
# dataset source. main.py renders it with st.write(texts.title_text).
title_text = """

# 공정 Sensing 데이터 분석 프로젝트

해당 실습은 제조 공정에서 발생한 Sensing 데이터를 분석하고 시각화하는 실습입니다.

Streamlit 프레임워크를 이용하여 구성하였으며 복잡한 코드 없이 다양한 위젯들을 구성할 수 있었습니다.

**[데이터 출처]**

- Multi-stage continuous-flow manufacturing process dataset (Real process data to predict factory output)

- https://www.kaggle.com/supergus/multistage-continuousflow-manufacturing-process

"""

# Chapter 1 heading and lead-in shown above the data-inspection radio
# buttons (main.py: st.write(texts.ch1_text)).
ch1_text = """

### 1. 데이터 확인하기

분석하려는 데이터는 어떻게 생겼는지 확인해 보겠습니다. 아래의 4가지 버튼 중 하나를 클릭하면 데이터를 확인할 수 있습니다.

"""

# Line-by-line explanation of a DataFrame.info() summary, shown below the
# chapter 1 data view. The figures quoted in the text (row count, column
# count, dtype counts, memory usage) are hard-coded here, not read from
# the loaded data.
ch1_text2 = """

---

* `<class 'pandas.core.frame.DataFrame'>`: 이 객체는 pandas의 DataFrame 타입임을 나타냅니다. DataFrame은 행과 열로 이루어진 2차원 데이터 구조로, 엑셀 시트나 SQL 테이블과 유사합니다.

* `RangeIndex: 14088 entries, 0 to 14087`: 이 데이터 프레임은 14,088개의 행을 가지고 있으며, 인덱스는 0부터 시작해 14,087까지입니다. RangeIndex는 정수 기반의 인덱싱을 의미합니다.

* `Columns: 116 entries, time_stamp to Stage2.Output.Measurement14.U.Setpoint`: 총 116개의 열이 있으며, 첫 번째 열의 이름은 time_stamp이고, 마지막 열의 이름은 Stage2.Output.Measurement14.U.Setpoint입니다.

* `dtypes: float64(108), int64(7), object(1)`: 이 데이터 프레임에는 세 가지 유형의 데이터 타입이 포함되어 있습니다. float64 타입의 데이터가 108개의 열에, int64 타입의 데이터가 7개의 열에, 그리고 문자열이나 다른 파이썬 객체를 포함할 수 있는 object 타입의 데이터가 1개의 열에 사용되었습니다.

* `memory usage: 12.5+ MB`: 이 데이터 프레임이 메모리에서 차지하는 크기는 약 12.5MB 이상입니다. + 기호는 실제 메모리 사용량이 이보다 더 클 수 있음을 의미합니다.

"""

# Chapter 2 introduction plus the section 2.1 (time series) lead-in.
# The sentence before the numbered list says "three" ("세 가지") so that it
# agrees with the three items listed below it.
ch2_text = """

### 2. 데이터 분석하기

본격적으로 데이터를 분석하기 위해서는 많은 관점에서 데이터를 확인할 필요가 있습니다. 데이터 분석을 위해 세 가지를 중점으로 확인해 보겠습니다.

1. **"시간"에 따른 데이터 확인하고 시각화하여 확인하기**

2. **"상관관계"를 확인하고 이를 시각화하여 확인하기**

3. **"데이터 분포"를 확인하고 이를 시각화하여 확인하기**

#### 2.1 "시간"에 따른 데이터 시각화하여 확인하기

`time_stamp` 컬럼을 확인하면 시간 데이터로 구성되어 있습니다. 시간에 따른 습도와 온도 데이터를 확인하고 이에 따른 제조 데이터를 확인해 보겠습니다.

* 좌측의 파란 그래프는 습도, 빨간 그래프는 온도를 의미합니다.

* 우측의 위젯을 통해 원하는 데이터를 선 그래프로 표현할 수 있습니다.

"""

# Section 2.2 heading and lead-in for the correlation table / heatmap
# (main.py: st.write(texts.ch2_text2)).
ch2_text2 = """

#### 2.2 "상관관계"를 확인하고 이를 시각화하여 확인하기

선그래프만으로는 제조 공정에 온도와 습도로 인해 영향이 발생되는지 확인하기가 어렵습니다. 이를 한 눈에 확인하기 위해 "상관관계" 라는 것을 이용하여 확인해보도록 하겠습니다.

아래의 버튼을 이용하면 상관관계를 확인할 수 있으며 이를 시각적으로 표현한 그래프를 확인할 수 있습니다.

"""

# Section 2.3 heading and usage notes for the histogram, the box plot and
# the outlier-removal switch (main.py: st.write(texts.ch2_text3)).
ch2_text3 = """

#### 2.3 "데이터 분포"를 확인하고 이를 시각화하여 확인하기

상관관계를 분석하기 위해서는 통계적인 개념이 필요하며 현재 데이터가 방대하여 분석하기가 어렵습니다.

데이터 분포를 확인하기 위해 `히스토그램(Histogram)`과 `박스플롯(boxplot)`을 이용하여 확인해 보겠습니다.

* `히스토그램(Histogram)`을 이용하면 데이터의 분포를 쉽게 확인할 수 있습니다. 아래의 슬라이더를 조절하면 막대의 개수를 조절할 수 있습니다.

* `박스플롯(boxplot)`을 이용하면 데이터의 간단한 통계 정보와 함께 이상치를 확인할 수 있습니다. 아래의 표는 선택한 변수의 간단한 통계 정보를 확인할 수 있습니다.

* 이상치를 제거한 데이터의 그래프를 보고 싶으면 상단의 체크박스를 클릭합니다.

"""

# Lead-in for the two per-machine histogram panels: raw-material data on
# the left, equipment data on the right (main.py: st.write(texts.ch2_text4)).
ch2_text4 = """

이번에는 다양한 데이터를 한 눈에 확인해 보겠습니다.

* 좌측은 Machine 1, 2, 3에 따른 원재료 데이터를 시각화 자료입니다.

* 우측은 Machine 1, 2, 3에 따른 설비 데이터를 시각화 자료입니다.

각 상단의 메뉴를 이용하면 Machine 1, 2, 3에 따른 데이터를 확인할 수 있습니다.

"""

# Chapter 3 heading and lead-in shown above the CSV download button
# (main.py: st.write(texts.ch3_text)).
ch3_text = """

### 3. 데이터 저장하기

데이터를 로컬 파일로 저장하고 싶다면 아래의 버튼을 누르면 저장할 수 있습니다.

"""

728x90

저작자표시 (새창열림)

코딩하는 덕구 🐶

파이썬과 streamlit을 이용한 데이터 분석 본문

파이썬과 streamlit을 이용한 데이터 분석

data_processing.py

main.py

texts.py

티스토리툴바