Overview
Python ํ๊ฒฝ์์ ๋ฐ์ดํฐ ๋ถ์์ ์ํด ์ฌ์ฉํ ์ ์๋ ๋ค์ํ ์ฝ๋๋ธ๋ญ์ ๋ชจ์๋์๋ค.
Platform: Python
First upload date: 2025-12-03
I. Useful Things
# Basic setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Misc display settings
plt.rcParams['font.family'] = 'Malgun Gothic'  # Korean font (Windows)
plt.rcParams['axes.unicode_minus'] = False     # fix minus-sign rendering with non-latin fonts
pd.set_option('display.max_rows', 20)          # print at most 20 rows
pd.set_option('display.max_columns', None) # ๋ชจ๋  ์ด์ ์ถ๋ ฅ

II. Data Collection
1. Crawling
1) requests library
# import library
import requests

# basic example
url = 'https://www.naver.com'
res = requests.get(url)
res.status_code  # 200 means success
res.text         # inspect the response body

# url encoding
url_base = 'https://www.naver.com/search'
dic_params = {'query':'์ต์ ์๊ธ', 'sort':0}
res = requests.get(url_base, params=dic_params)  # fixed: was `request.get` (module-name typo)
val_search_text_encoded = requests.utils.quote('์ต์ ์๊ธ')  # percent-encode the query (fixed: was `resquest.utils.quote`)
url = f'{url_base}?query={val_search_text_encoded}&sort=0' # ์ธ์ฝ๋ฉ ์ ์ฉ๋ url

2) bs4 library
from bs4 import BeautifulSoup as bs

# parsing test
text = '<html><div>bs4!!</div></html>'
text_bs = bs(text, 'html.parser')
text_bs.text  # check result (output: 'bs4!!')

# Main selector syntax — .select(): all matching tags, .select_one(): first match only
# (fixed: Korean comments were split across lines by extraction, breaking syntax)
text = bs('<body><div></div></body>', features = 'html.parser')
text.select('body')        # <body> tags
text.select('body div')    # <div> descendants of <body>
text.select('body > div')  # <div> direct children of <body>
text.select('.news')       # tags whose class is "news"
text.select('.news.box')   # tags with both classes "news" and "box"
text.select('div.news')    # <div> tags whose class is "news"
text.select('#pw')         # tag with id "pw"
text.select('div#pw')      # <div> tag with id "pw"
text.select('a[href]')     # <a> tags that have an href attribute
# Example: crawl Naver news headlines/links
url_naver = 'https://search.naver.com/search.naver?ssc=tab.news.all&where=news&sm=tab_jum&query=%EC%B5%9C%EC%A0%80%EC%9E%84%EA%B8%88'
res = requests.get(url_naver)
bs_res = bs(res.text, 'html.parser')
ls_news = bs_res.select('div.group_news > ul.list_news')
ls_title = ls_news[0].select('span.sds-comps-text-type-headline1')
ls_title2 = [t.text for t in ls_title]  # only the headlines remain (fixed: replaced index loop with a comprehension)
ls_title2
ls_news_links = ls_news[0].select("div.sds-comps-base-layout > div > a[nocr='1']")
ls_news_links[0]['href'] # news๊ธฐ์ฌ ๋งํฌ๋ง ๋จ๋๋ค

3) Selenium library
# (fixed: trailing Korean comments were split across lines by extraction, breaking syntax)
from selenium import webdriver                                    # core browser-control entry point (driver creation/control)
from selenium.webdriver.chrome.service import Service             # service wrapper around the ChromeDriver process
from selenium.webdriver.chrome.options import Options             # Chrome launch options (headless, UA, language, ...)
from webdriver_manager.chrome import ChromeDriverManager          # auto-download / version-manage ChromeDriver
from selenium.webdriver.support.ui import WebDriverWait           # explicit wait (block until a condition holds)
from selenium.webdriver.support import expected_conditions as EC  # condition library used with WebDriverWait
from selenium.webdriver.common.by import By                       # element-locating strategies (By.ID, By.CSS_SELECTOR, ...)
from selenium.webdriver.common.keys import Keys                   # keyboard input constants (ENTER, TAB, ...)
from selenium.webdriver.support.ui import Select                  # helper for <select> dropdowns (choose/read options)
# Basic driver startup
service = Service(ChromeDriverManager().install())  # download/locate a matching ChromeDriver binary and wrap it
drv = webdriver.Chrome(service = service)  # launch a Chrome instance controlled by that driver
wait = WebDriverWait(drv, 10)  # explicit-wait helper with a 10-second timeout
# ์์: ๋ฌธ์ฒด๋ถ ๊ตญ๋ฆฝ์ฅ์ ์ธ๋์๊ด ๋ฐ์ดํฐ ํฌ๋กค๋ง
drv.get("https://www.nld.go.kr/home/libraryPossession.do?menu=menu_03_03&low=N")
val_page = 1 # ํฌ๋กค๋ง page๋ฒํธ ์ค์
drv.execute_script(f"tabGB({val_page});") # val_page๋ก ์ด๋
res = drv.page_source # requests.get(url)๊ณผ ์ ์ฌ
bs_res = bs(res, "html.parser")
ls_thead = [t.text for t in bs_res.select("thead th")] # columns ๋ฆฌ์คํธ ๋ง๋ค๊ธฐ
bs_tbody_tr = bs_res.select("tbody > tr")
ls_tbody = [[t.text for t in bs_tbody_tr_1] for bs_tbody_tr_1 in bs_tbody_tr]
df_tbl = pd.DataFrame(ls_tbody, columns = ls_thead) # val_page์ ๋ฐ์ดํฐ ํฌ๋กค๋ง ์๋ฃ.
# scroll container ์ง์
scroll_by = drv.find_element(By.CSS_SELECTOR, 'div#scroll_container') scroll_by.send_keys(Keys.End) # End ํค
scroll_by.send_keys(Keys.PaheDn) # PageDn ํค2. API
1) basic form
import requests
# ์์: ๊ตญํ ๊ตํต๋ถ ์ํํธ ์ค๊ฑฐ๋๊ฐapi
service_key = 'your_key'
url = 'https://apis.data.go.kr/1613000/RTMSDataSvcAptTrade/getRTMSDataSvcAptTrade'
params = {
"serviceKey": service_key, # ํ์: ์๋น์ค ์ธ์ฆํค
"LAWD_CD": "11110", # ํ์: ํ์ ๊ตฌ์ญ ์ฝ๋
"DEAL_YMD": "202407", # ํ์: ์กฐํ๋
์ (YYYYMM)
"pageNo": "1", # ์ต์
: ํ์ด์ง ๋ฒํธ
"numOfRows": "100" # ์ต์
: ํ ํ์ด์ง ๊ฒฐ๊ณผ ์
}

2) GCP geocoding
import googlemaps

# Address -> latitude/longitude coordinates
key = 'your_key'
addr = '์์ธ์ญ'
gmaps = googlemaps.Client(key=key)
res_cd = gmaps.geocode(addr)
lat = res_cd[0]['geometry']['location']['lat']
lng = res_cd[0]['geometry']['location']['lng']

def gcp_geocoding(addr):
    """Return [lat, lng] for an address string via the Google geocoding API."""
    res_cd = gmaps.geocode(addr)  # reuse the module-level client (was re-created on every call)
    val_lat = res_cd[0]["geometry"]["location"]["lat"]
    val_lng = res_cd[0]["geometry"]["location"]["lng"]
    return [val_lat, val_lng]

df[['lat', 'lon']] = df['addr'].apply(gcp_geocoding).apply(pd.Series)
# ์/๊ฒฝ๋ -> ์ฃผ์ ๋ณํ (reverse_geocode)
key_geo = 'your_key'
gmaps = googlemaps.Client(key=key_geo)
def gcp_rev_geocoding(lat, lon):
result = gmaps.reverse_geocode((latitude, longitude))
dic_addr = result[0]['formatted_address']
return dic_addr # ์ฃผ์ ๋ฌธ์์ด ๋ฐํ3) kakao map api
import request
# kakao api ํธ์ถ
url = "https://dapi.kakao.com/v2/local/search/address.json" #์์ฒญํ url ์ฃผ์
Key = 'your_rest_api_key' #REST API ํค(์ ํจํ ํค)
headers = {"Authorization": f"KakaoAK {key}"}
addr = '์์ธํน๋ณ์ ์ข
๋ก๊ตฌ ์ฒญ์๋๋ก 1'
result = requests.get(url, headers=headers,
params = {'query': addr}).json()
region_1depth = result['documents'][0]['address']['region_1depth_name']
region_2depth = result['documents'][0]['address']['region_2depth_name']
lon = result['documents'][0]['address']['x']
lat = result['documents'][0]['address']['y']III. Data Analysis
1. EDA
1) ์๊ด๋ถ์
df = dataframe.corr() # ์๊ด๊ด๊ณ
sns.heatmap(df, cmap = 'RdYlBu_r', # ํ~๋นจ
annot = True, # ์ค์ ๊ฐ์ ํ์ํ๋ค
mask= df < 0.2, # ํ์ํ์ง ์์ ๋ง์คํฌ ๋ถ๋ถ์ ์ง์ ํ๋ค
linewidths=.5, # ๊ฒฝ๊ณ๋ฉด ์ค์ ์ผ๋ก ๊ตฌ๋ถํ๊ธฐ
cbar_kws={"shrink": .5},# ์ปฌ๋ฌ๋ฐ ํฌ๊ธฐ ์ ๋ฐ์ผ๋ก ์ค์ด๊ธฐ
vmin = -1,vmax = 1 # ์ปฌ๋ฌ๋ฐ ๋ฒ์ -1 ~ 1 )2. Text Mining
1) ๋น๋ ๋ถ์
# ๋จ์ด ํ ํฐํ
ser_msg = df_msg.loc[:, 'MSG_CN']  # string Series — fixed: `.loc[, ...]` is a syntax error
ser_msg = ser_msg.str.replace("\\(.*?\\)", "", regex=True)  # drop everything inside ( ), e.g. "(photo=news1)" -> ""
ser_msg = ser_msg.str.replace("\\{.*?\\}", "", regex=True)  # drop everything inside { }
ser_msg = ser_msg.str.replace("\\[.*?\\]", "", regex=True)  # drop everything inside [ ]
ser_msg = ser_msg.str.replace("[^๊ฐ-ํฃA-Za-z0-9]", " ", regex=True)  # replace everything except Korean, English and digits with a space
ser_msg = ser_msg.str.replace(" {2,}", " ", regex=True)  # collapse two-or-more consecutive spaces into one
ser_msg = ser_msg.str.replace("^ | $", "", regex=True)  # trim a leading or trailing space

# word-frequency DataFrame
ser_msg = ser_msg.str.split(" ").explode()
ser_msg_cnt = ser_msg.value_counts()
ser_msg_cnt.index.name = "word"
df_msg_cnt = ser_msg_cnt.reset_index()
# stopword removal
df_msg_cnt_cut = df_msg_cnt.loc[df_msg_cnt["word"].str.len() >= 2, ]  # keep words of length >= 2
val_regex = "(๋ค|๋|๊ณ |์|์)$"  # stopword suffixes (endings/particles)
# fixed: result was bound to `msg_cnt_cut`, so downstream code kept using the UNFILTERED frame
df_msg_cnt_cut = df_msg_cnt_cut.loc[~df_msg_cnt_cut["word"].str.contains(val_regex), ]

from wordcloud import WordCloud
dic_cmt_cnt = dict(zip(df_msg_cnt_cut["word"], df_msg_cnt_cut["count"]))
#mask = np.array(Image.open("image.png")) # wordcloud ๋ง์คํฌ ์ง์
obj_wc = WordCloud(font_path = "C:/Windows/Fonts/malgun.ttf",
width = 400, height = 400,
max_font_size = 120,
min_font_size = 10,
prefer_horizontal=0.5,
mask = mask,
colormap='YlGnBu',
background_color = "#FFFFFF",
random_state = 123)
obj_wc = obj_wc.generate_from_frequencies(dic_cmt_cnt)
plt.figure(figsize=(16,8))
plt.imshow(obj_wc)
plt.axis("off")
plt.show()2) ํํ์ ๋ถ์ using Kiwi
from kiwipiepy import Kiwi

kiwi = Kiwi()
text = '๋ ๋น์ฅ ์ง์ ๊ฐ๊ณ ์ถ์ด'
tokens = kiwi.tokenize(text)  # split into morphemes

# noun/adjective filtering for frequency analysis (word clouds, etc.)
# (fixed: the Korean comment was split across lines, breaking syntax)
def udf_kiwi_nv_tokenizer(text):
    """Return the noun/adjective morpheme forms of `text`, or NaN for missing input."""
    if pd.isna(text):  # guard clause: propagate missing values
        return np.nan
    existed_text = []
    for val_token in kiwi.tokenize(text):
        if val_token.tag in ['NNG', 'NNP', 'VA']:  # common noun, proper noun, adjective
            existed_text.append(val_token.form)
    return existed_text
import os
from transformers import pipeline  # hugging face

# hugging face api key
os.environ["HF_TOKEN"] = 'your key'

# Load the korean_sentiment classification model and smoke-test it
classifier = pipeline("text-classification", model="matthewburke/korean_sentiment")
sample_text = "์ํ ์ฌ๋ฐ๋ค."
sample_preds = classifier(sample_text, return_all_scores=True)
print(sample_preds[0][1]['score'] > 0.5)  # True when classified as positive
# UDF ์์ฑ
def review_PN_classifier(text):
preds = classifier(text, return_all_scores=True)
n_score = round(preds[0][0]['score'], 2)
p_score = round(preds[0][1]['score'], 2)
return n_score, p_score # df['text'].apply(review_PN_classifier).apply(pd.Series)3. GIS ๋ถ์
1) Geopandas
import geopandas as gpd
df_geo = gpd.read_file('map_sido.shp', encoding='cp949')
ser_cent = df_geo['geometry'].centroid # return ์/๊ฒฝ๋ tuple
df_loc = pd.DataFrame(dict(loc = df_geo['CTP_KOR_NM'],
lon = ser_cent.x,
lat = ser_cent.y))2) Folium
import folium

# sample DataFrame
df = pd.DataFrame(dict(
    lat=[37.5665, 37.5670, 37.5650],
    lon=[126.9780, 126.9790, 126.9770],
    col = ['green','red','blue'],
    icon=["image", "poo", "wand-magic-sparkles"]  # Font Awesome icon names (prefix='fa')
))

# build a Seoul map
m = folium.Map(location = [37.6, 126.8], tiles='cartodbpositron', attr='Google')  # fixed tileset typo: was 'cartodbpostron'
for idx, row in df.iterrows():
    folium.Marker(
        location=[row['lat'], row['lon']],  # fixed: list bracket was never closed
        icon=folium.Icon(icon=row['icon'], color=row['col'], prefix='fa')).add_to(m)