Overview

Python ํ™˜๊ฒฝ์—์„œ ๋ฐ์ดํ„ฐ ๋ถ„์„์„ ์œ„ํ•ด ์‚ฌ์šฉํ•  ์ˆ˜ ์žˆ๋Š” ๋‹ค์–‘ํ•œ ์ฝ”๋“œ๋ธ”๋Ÿญ์„ ๋ชจ์•„๋‘์—ˆ๋‹ค.

Platform: Python

First upload date: 2025-12-03


I. Useful Things

Basic Setting
# Basic setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Misc display/rendering options
plt.rc('font',family='Malgun Gothic') # Korean font (Windows 'Malgun Gothic')
plt.rc('axes', unicode_minus=False) # fix minus-sign rendering with non-default fonts
pd.options.display.max_rows = 20 # print at most 20 rows
pd.set_option('display.max_columns', None) # print all columns

II. Data Collection

1. Crawling

1) requests library

# import library
import requests

# basic example
url = 'https://www.naver.com'
res = requests.get(url)
res.status_code # 200 means success
res.text # inspect the response body

# url encoding
url_base = 'https://www.naver.com/search'
dic_params = {'query':'์ตœ์ €์ž„๊ธˆ', 'sort':0}
res = requests.get(url_base, params = dic_params)  # fixed: 'request.get' -> 'requests.get'
val_search_text_encoded = requests.utils.quote('์ตœ์ €์ž„๊ธˆ') # percent-encode the query (fixed typo 'resquest')
url = f'{url_base}?query={val_search_text_encoded}&sort=0' # URL with the encoding applied

2) bs4 library

from bs4 import BeautifulSoup as bs

# parsing test
text = '<html><div>bs4!!</div></html>'
text_bs = bs(text, 'html.parser')
text_bs.text # check the result (output: 'bs4!!')

# main syntax — .select(): all matching tags, .select_one(): only the first match
text = bs('<body><div></div></body>', features = 'html.parser')
text.select('body') # <body> tags
text.select('body div') # <div> tags anywhere under <body>
text.select('body > div') # <div> tags that are direct children of <body>
text.select('.news') # tags whose class is "news"
text.select('.news.box') # tags having both classes "news" and "box"
text.select('div.news') # <div> tags whose class is "news"
text.select('#pw') # the tag whose id is "pw"
text.select('div#pw') # <div> tags whose id is "pw"
text.select('a[href]') # <a> tags that have an href attribute

# example: crawl Naver news headlines/links (uses `requests` imported in an earlier snippet)
url_naver = 'https://search.naver.com/search.naver?ssc=tab.news.all&where=news&sm=tab_jum&query=%EC%B5%9C%EC%A0%80%EC%9E%84%EA%B8%88'
res = requests.get(url_naver)
bs_res = bs(res.text, 'html.parser')
ls_news = bs_res.select('div.group_news > ul.list_news')
ls_title = ls_news[0].select('span.sds-comps-text-type-headline1')

ls_title2 = []
for i in range(len(ls_title)):
    ls_title2.append(ls_title[i].text)
ls_title2 # only the news headlines remain

ls_news_links = ls_news[0].select("div.sds-comps-base-layout > div > a[nocr='1']")
ls_news_links[0]['href'] # only the news article links remain

3) Selenium library

from selenium import webdriver  # core browser-control object (creates/controls the driver)
from selenium.webdriver.chrome.service import Service  # service wrapper around the ChromeDriver process
from selenium.webdriver.chrome.options import Options  # Chrome launch options (headless, UA, language, ...)
from webdriver_manager.chrome import ChromeDriverManager  # auto-download / version-manage ChromeDriver
from selenium.webdriver.support.ui import WebDriverWait  # explicit waits (block until a condition holds)
from selenium.webdriver.support import expected_conditions as EC  # condition helpers used with WebDriverWait
from selenium.webdriver.common.by import By  # locator strategies (By.ID, By.CSS_SELECTOR, ...)
from selenium.webdriver.common.keys import Keys  # keyboard constants (ENTER, TAB, ...)
from selenium.webdriver.support.ui import Select  # helper for <select> dropdowns (choose/read options)

# basic startup
service = Service(ChromeDriverManager().install())
drv = webdriver.Chrome(service = service)
wait = WebDriverWait(drv, 10)  # explicit wait of up to 10 seconds
 
# example: crawl the MCST National Library for the Disabled holdings table
drv.get("https://www.nld.go.kr/home/libraryPossession.do?menu=menu_03_03&low=N")
val_page = 1 # page number to crawl
drv.execute_script(f"tabGB({val_page});") # jump to val_page via the site's own JS function
res = drv.page_source # similar to requests.get(url).text
bs_res = bs(res, "html.parser")
ls_thead = [t.text for t in bs_res.select("thead th")] # build the column-name list
bs_tbody_tr = bs_res.select("tbody > tr")
ls_tbody = [[t.text for t in bs_tbody_tr_1] for bs_tbody_tr_1 in bs_tbody_tr]
df_tbl = pd.DataFrame(ls_tbody, columns = ls_thead) # crawling of val_page's data is done
 
Page scroll function
# designate the scroll container element
scroll_by = drv.find_element(By.CSS_SELECTOR, 'div#scroll_container')
scroll_by.send_keys(Keys.END)        # End key (fixed: constant is upper-case END; two statements were fused on one line)
scroll_by.send_keys(Keys.PAGE_DOWN)  # PageDown key (fixed typo 'PaheDn' -> PAGE_DOWN)

2. API

1) basic form

import requests

# example: MOLIT apartment actual-transaction-price API
service_key = 'your_key'
url = 'https://apis.data.go.kr/1613000/RTMSDataSvcAptTrade/getRTMSDataSvcAptTrade'
params = {
    "serviceKey": service_key,      # required: service auth key
    "LAWD_CD":    "11110",          # required: administrative district code
    "DEAL_YMD":   "202407",         # required: year-month to query (YYYYMM)
    "pageNo":     "1",              # optional: page number
    "numOfRows":  "100"             # optional: results per page
}

2) GCP geocoding

import googlemaps

# address -> latitude/longitude conversion
key = 'your_key'
addr = '์„œ์šธ์—ญ'
gmaps = googlemaps.Client(key=key)
res_cd = gmaps.geocode(addr)
lat = res_cd[0]['geometry']['location']['lat']
lng = res_cd[0]['geometry']['location']['lng']

def gcp_geocoding(addr): # user-defined function
    """Geocode *addr* and return [latitude, longitude]; reads module-level `key`."""
    gmaps = googlemaps.Client(key = key)
    res_cd = gmaps.geocode(addr)
    val_lat = res_cd[0]["geometry"]["location"]["lat"]
    val_lng = res_cd[0]["geometry"]["location"]["lng"]
    return [val_lat, val_lng]
df[['lat', 'lon']] = df['addr'].apply(gcp_geocoding).apply(pd.Series)
 
# ์œ„/๊ฒฝ๋„ -> ์ฃผ์†Œ ๋ณ€ํ™˜ (reverse_geocode)
key_geo = 'your_key'
gmaps = googlemaps.Client(key=key_geo)
def gcp_rev_geocoding(lat, lon):
    result = gmaps.reverse_geocode((latitude, longitude))
    dic_addr = result[0]['formatted_address']
    return dic_addr # ์ฃผ์†Œ ๋ฌธ์ž์—ด ๋ฐ˜ํ™˜

3) kakao map api

import requests  # fixed: was 'import request'

# call the Kakao Local API
url = "https://dapi.kakao.com/v2/local/search/address.json" # endpoint to request
key = 'your_rest_api_key' # REST API key (fixed: was assigned to 'Key' but read as 'key')
headers = {"Authorization": f"KakaoAK {key}"}
addr = '์„œ์šธํŠน๋ณ„์‹œ ์ข…๋กœ๊ตฌ ์ฒญ์™€๋Œ€๋กœ 1'

result = requests.get(url, headers=headers,
                      params = {'query': addr}).json()

region_1depth = result['documents'][0]['address']['region_1depth_name'] # province/city level
region_2depth = result['documents'][0]['address']['region_2depth_name'] # district level
lon = result['documents'][0]['address']['x'] # Kakao returns x = longitude
lat = result['documents'][0]['address']['y'] # and y = latitude

III. Data Analysis

1. EDA

1) ์ƒ๊ด€๋ถ„์„

df = dataframe.corr() # correlation matrix (assumes `dataframe` holds numeric columns)
sns.heatmap(df, cmap = 'RdYlBu_r',      # blue -> red
            annot = True,               # print the actual values in each cell
            mask = df < 0.2,            # hide the masked (below-threshold) cells
            linewidths=.5,              # separate cells with thin lines
            cbar_kws={"shrink": .5},    # halve the colorbar size
            vmin = -1, vmax = 1)        # colorbar range -1 ~ 1 (fixed: closing ')' was trapped inside a comment)

2. Text Mining

1) ๋นˆ๋„ ๋ถ„์„

Text tokenizing
# ๋‹จ์–ด ํ† ํฐํ™”
ser_msg = df_msg.loc[, 'MSG_CN'] # ๋ฌธ์ž์—ด Series 
ser_msg = ser_msg.str.replace("\\(.*?\\)", "", regex=True)  # ๊ด„ํ˜ธ () ์•ˆ์˜ ๋ชจ๋“  ๋‚ด์šฉ์„ ์ œ๊ฑฐ. ์˜ˆ: "(์‚ฌ์ง„=๋‰ด์Šค1)" โ†’ ""
ser_msg = ser_msg.str.replace("\\{.*?\\}", "", regex=True)  # ์ค‘๊ด„ํ˜ธ {} ์•ˆ์˜ ๋ชจ๋“  ๋‚ด์šฉ์„ ์ œ๊ฑฐ
ser_msg = ser_msg.str.replace("\\[.*?\\]", "", regex=True)  # ๋Œ€๊ด„ํ˜ธ [] ์•ˆ์˜ ๋ชจ๋“  ๋‚ด์šฉ์„ ์ œ๊ฑฐ
ser_msg = ser_msg.str.replace("[^๊ฐ€-ํžฃA-Za-z0-9]", " ", regex=True)  # ํ•œ๊ธ€๊ณผ ์˜์–ด๋ฅผ ์ œ์™ธํ•œ ๋ชจ๋“  ๋ฌธ์ž(์ˆซ์ž, ํŠน์ˆ˜๋ฌธ์ž ๋“ฑ)๋ฅผ ๊ณต๋ฐฑ์œผ๋กœ ๋Œ€์ฒด
ser_msg = ser_msg.str.replace(" {2,}", " ", regex=True)  # ๋‘ ์นธ ์ด์ƒ์˜ ์—ฐ์†๋œ ๊ณต๋ฐฑ์„ ํ•˜๋‚˜์˜ ๊ณต๋ฐฑ์œผ๋กœ ์ถ•์†Œ
ser_msg = ser_msg.str.replace("^ | $", "", regex=True)  # ๋ฌธ์žฅ ๋งจ ์•ž ๋˜๋Š” ๋งจ ๋’ค์— ์žˆ๋Š” ๊ณต๋ฐฑ ์ œ๊ฑฐ
 
# ๋‹จ์–ด ๋นˆ๋„์ˆ˜ DataFrame
ser_msg = ser_msg.str.split(" ").explode()
ser_msg_cnt = ser_msg.value_counts()
ser_msg_cnt.index.name = "word"
df_msg_cnt = ser_msg_cnt.reset_index()
 
# ๋ถˆ์šฉ์–ด ์ œ๊ฑฐ
df_msg_cnt_cut = df_msg_cnt.loc[df_msg_cnt["word"].str.len() >= 2, ] # ํ—ˆ์šฉ ๋‹จ์–ด ๊ธธ์ด ์ง€์ •
val_regex = "(๋‹ค|๋Š”|๊ณ |์€|์„)$" # ๋ถˆ์šฉ์–ด(๋™์‚ฌ ๋“ฑ) ์ง€์ •
msg_cnt_cut = df_msg_cnt_cut.loc[~df_msg_cnt_cut["word"].str.contains(val_regex), ] # ๋ถˆ์šฉ์–ด ์ œ์™ธ
Wordcloud
from wordcloud import WordCloud

dic_cmt_cnt = dict(zip(df_msg_cnt_cut["word"], df_msg_cnt_cut["count"]))
mask = None # or: np.array(Image.open("image.png")) to shape the cloud (fixed: 'mask' was undefined -> NameError)

obj_wc = WordCloud(font_path = "C:/Windows/Fonts/malgun.ttf", # Korean-capable font
                   width = 400, height = 400,
                   max_font_size = 120,
                   min_font_size = 10,
                   prefer_horizontal=0.5, # draw roughly half the words horizontally
                   mask = mask,
                   colormap='YlGnBu',
                   background_color = "#FFFFFF",
                   random_state = 123) # reproducible layout

obj_wc = obj_wc.generate_from_frequencies(dic_cmt_cnt)

plt.figure(figsize=(16,8))
plt.imshow(obj_wc)
plt.axis("off")
plt.show()

2) ํ˜•ํƒœ์†Œ ๋ถ„์„ using Kiwi

Using 'Kiwi' tokenizer
from kiwipiepy import Kiwi
kiwi = Kiwi()

text = '๋‚˜ ๋‹น์žฅ ์ง‘์— ๊ฐ€๊ณ ์‹ถ์–ด'
tokens = kiwi.tokenize(text) # split into morphemes
 
# ์›Œ๋“œํด๋ผ์šฐ๋“œ ๋“ฑ ๋นˆ๋„๋ถ„์„์„ ์œ„ํ•œ ๋ช…์‚ฌ, ํ˜•์šฉ์‚ฌ ํ•„ํ„ฐ๋ง
def udf_kiwi_nv_tokenizer(text):
    if pd.notna(text):
        tokens = kiwi.tokenize(text)
        existed_text = []
        for val_token in tokens:
            if val_token.tag in ['NNG','NNP','VA']:
                existed_text.append(val_token.form)
        return existed_text
    else:
        return np.nan

3) ๊ฐ์„ฑ ๋ถ„์„ using HuggingFace

Using 'Korean-Sentiment' model
import os
from transformers import pipeline # hugging face

# hugging face api key
os.environ["HF_TOKEN"] = 'your key'

# load the Korean-sentiment model and sanity-check that it works
classifier = pipeline("text-classification", model="matthewburke/korean_sentiment")
custom_tweet = "์˜ํ™” ์žฌ๋ฐŒ๋‹ค."
preds = classifier(custom_tweet, return_all_scores=True) # NOTE(review): return_all_scores is deprecated in newer transformers; top_k=None is the replacement — verify against the installed version
is_positive = preds[0][1]['score'] > 0.5 # index 1 is assumed to be the positive label for this model — TODO confirm
print(is_positive)
 
# UDF ์ƒ์„ฑ
def review_PN_classifier(text):
    """Return (negative_score, positive_score) for *text*, each rounded to 2 dp.

    Relies on the module-level HuggingFace `classifier` pipeline.
    Usage: df['text'].apply(review_PN_classifier).apply(pd.Series)
    """
    scores = classifier(text, return_all_scores=True)[0]
    neg_score = round(scores[0]['score'], 2)
    pos_score = round(scores[1]['score'], 2)
    return neg_score, pos_score

3. GIS ๋ถ„์„

1) Geopandas

import geopandas as gpd

df_geo = gpd.read_file('map_sido.shp', encoding='cp949')
ser_cent = df_geo['geometry'].centroid # GeoSeries of centroid Points (NOTE(review): centroid on a lat/lon CRS is only approximate — confirm the shapefile's CRS)
df_loc = pd.DataFrame(dict(loc = df_geo['CTP_KOR_NM'],
                            lon = ser_cent.x,
                            lat = ser_cent.y))

2) Folium

import folium

# build an example DataFrame
df = pd.DataFrame(dict(
    lat=[37.5665, 37.5670, 37.5650],
    lon=[126.9780, 126.9790, 126.9770],
    col = ['green','red','blue'],
    icon=["image", "poo", "wand-magic-sparkles"] # Font Awesome icon names
))
# create a Seoul map
m = folium.Map(location = [37.6, 126.8], tiles='cartodbpositron', attr='Google') # fixed tile-name typo 'cartodbpostron'

for idx, row in df.iterrows():
    folium.Marker(
        location=[row['lat'], row['lon']],  # fixed: missing closing ']' was a SyntaxError
        icon=folium.Icon(icon=row['icon'], color=row['col'], prefix='fa')  # prefix='fa' selects Font Awesome icons
    ).add_to(m)