import requests
import lxml.html
import pandas as pd
from pandas.io import sql
import os
import time
import datetime
Date list needed for the news URLs
date_index = pd.date_range(start='20210801', end='20210803')
date_list = date_index.strftime("%Y%m%d").tolist()
date_list
category = ['politics', 'economic', 'society', 'culture', 'foreign', 'digital']
id_list = ['10000', '10100', '10200', '10300', '10400', '10500']
category_id = {'politics' : '10000', 'economic' : '10100', 'society' : '10200', 'culture' : '10300', 'foreign' : '10400', 'digital' : '10500'}
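To see how these values get used, the breaking-news list URL that the crawling loop below requests is assembled from a category slug, a page number, and a date:

# Example of the list-page URL format used in the crawling loop below
list_url = 'http://news.daum.net/breakingnews/{}?page={}&regDate={}'.format(category[5], 1, date_list[0])
print(list_url)  # http://news.daum.net/breakingnews/digital?page=1&regDate=20210801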
Crawling the news article body
import re
import string

def get_detail(url):
    body = []
    punc = '[!"#$%&\'()*+,-./:;<=>?[\]^_`{|}~“”·]'
    # e-mail address at the end of the text (reporter bylines);
    # the hyphen goes last in the class so it is not read as a range
    reg = re.compile('[a-zA-Z0-9+_.-]+@[a-zA-Z0-9-]+[a-zA-Z0-9-.]+$')
    response = requests.get(url)
    root = lxml.html.fromstring(response.content)
    for p in root.xpath('//*[@id="harmonyContainer"]/section/p'):
        if p.text:  # skip paragraphs with no direct text
            temp = re.sub(punc, '', p.text)  # strip special characters
            temp = re.sub(reg, '', temp)     # strip e-mail addresses
            body.append(temp)
    full_body = ' '.join(body)
    return full_body
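For a quick smoke test, get_detail can be called on a single article URL; the trailing path segment of the URL is the news_id that the comment and reaction crawlers below take (the URL here is a made-up placeholder, not a real article):

# Made-up article URL, for illustration only; real URLs are scraped
# from the list pages in the main loop below.
sample_url = 'https://news.v.daum.net/v/20210801120000001'
print(sample_url.split("/")[-1])     # news_id used by get_comment / get_sentiment
print(get_detail(sample_url)[:100])  # first 100 characters of the cleaned body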
Crawling the news comments
The auth key that goes after 'Authorization' : expires, so it has to be copied in fresh from time to time.
I tried to fetch all of the comments, but could not get more than 100.
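Since the token is short-lived and replaced by hand anyway, one convenience (my suggestion, not part of the original run) is to keep it out of the source, for example in an environment variable; os is already imported above:

# Read the short-lived bearer token from an environment variable
# (DAUM_TOKEN is a made-up name) instead of hard-coding it below.
AUTH_TOKEN = os.environ.get('DAUM_TOKEN', '')
auth_headers = {'Authorization': 'Bearer ' + AUTH_TOKEN}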
import requests
from bs4 import BeautifulSoup

def get_comment(news_id):
    list_comment = []
    url = 'https://comment.daum.net/apis/v1/posts/@{}/comments?'.format(news_id)
    params = {'parentId' : '0', 'offset' : '0', 'limit' : '100', 'sort' : 'RECOMMEND', 'isInitial' : 'true'}
    headers = {'Authorization' : 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJmb3J1bV9rZXkiOiJuZXdzIiwiZ3JhbnRfdHlwZSI6ImFsZXhfY3JlZGVudGlhbHMiLCJzY29wZSI6W10sImV4cCI6MTYzMTQ0OTE1OSwiYXV0aG9yaXRpZXMiOlsiUk9MRV9DTElFTlQiXSwianRpIjoiMmZhZDYzMzYtYzgyOC00Mzk3LWFhYzEtNjhjYzQ4YzUzMTdmIiwiZm9ydW1faWQiOi05OSwiY2xpZW50X2lkIjoiMjZCWEF2S255NVdGNVowOWxyNWs3N1k4In0.X9wnyiofD-33QI4WpEuapV-onY2siDWGJChcw_tkxwU'
    }
    response = requests.get(url, headers = headers, params = params)
    status_code = response.status_code
    comment_all = response.json()  # list of comment objects
    #print(response.json())
    #print(count_all['commentCount'])
    for i in comment_all:
        li = []
        li.append(i['content'])
        li.append(float(i['likeCount']))
        li.append(float(i['dislikeCount']))
        list_comment.append(li)
        #print(li)
        #print("----")
    if len(list_comment) == 0:
        list_comment.append('NA')  # marker for articles with no comments
    return list_comment
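The 100-comment ceiling matches the limit parameter, so in principle the offset parameter should page through the rest. Here is a sketch of that idea, with the caveat that when I tried it the endpoint never returned more than the first 100 (headers is the same auth dict as in get_comment, passed in explicitly):

def get_comment_paged(news_id, headers, page_size=100, max_pages=10):
    # Assumption: a growing offset keeps returning older comments;
    # stop as soon as a page comes back empty.
    list_comment = []
    url = 'https://comment.daum.net/apis/v1/posts/@{}/comments?'.format(news_id)
    for page in range(max_pages):
        params = {'parentId' : '0', 'offset' : str(page * page_size),
                  'limit' : str(page_size), 'sort' : 'RECOMMEND'}
        batch = requests.get(url, headers=headers, params=params).json()
        if not batch:
            break
        for i in batch:
            list_comment.append([i['content'], float(i['likeCount']), float(i['dislikeCount'])])
    return list_comment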
Crawling each article's emotion reactions
As before, the auth key that goes after 'Authorization' : has to be copied in fresh from time to time.
A teammate helped me with this part; it is crawled the same way as the comments.
import requests
import json

def get_sentiment(news_id):
    url = 'https://action.daum.net/apis/v1/reactions/home?itemKey={}'.format(news_id)
    header = {
        "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "referer": url,
        'Authorization' : "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJmb3J1bV9rZXkiOiJuZXdzIiwiZ3JhbnRfdHlwZSI6ImFsZXhfY3JlZGVudGlhbHMiLCJzY29wZSI6W10sImV4cCI6MTYzMTQ0OTE1OSwiYXV0aG9yaXRpZXMiOlsiUk9MRV9DTElFTlQiXSwianRpIjoiMmZhZDYzMzYtYzgyOC00Mzk3LWFhYzEtNjhjYzQ4YzUzMTdmIiwiZm9ydW1faWQiOi05OSwiY2xpZW50X2lkIjoiMjZCWEF2S255NVdGNVowOWxyNWs3N1k4In0.X9wnyiofD-33QI4WpEuapV-onY2siDWGJChcw_tkxwU"
    }
    raw = requests.get(url, headers=header)
    s_jsonData = json.loads(raw.text)
    # reaction counts: 좋아요 (like), 감동이에요 (moved), 슬퍼요 (sad),
    # 화나요 (angry), 추천해요 (recommend)
    sentiment = {"좋아요" : 0, "감동이에요" : 0, "슬퍼요" : 0, "화나요" : 0, "추천해요" : 0}
    sentiment['좋아요'] = s_jsonData['item']['stats']['LIKE']
    sentiment['감동이에요'] = s_jsonData['item']['stats']['IMPRESS']
    sentiment['슬퍼요'] = s_jsonData['item']['stats']['SAD']
    sentiment['화나요'] = s_jsonData['item']['stats']['ANGRY']
    sentiment['추천해요'] = s_jsonData['item']['stats']['RECOMMEND']
    return sentiment
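Called on a single news_id, this returns a plain dict of counts, which is what gets stored in the Sentiment column later (the id and the values here are made up):

# Hypothetical call; the id is the last path segment of an article URL
get_sentiment('20210801120000001')
# {'좋아요': 12, '감동이에요': 0, '슬퍼요': 3, '화나요': 25, '추천해요': 1}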
Crawling everything and saving the full DataFrame as a pickle
import pickle

df_list = []
id_count = 0
for REG_DATE in date_list:
    page = 1
    max_page = 0
    while True:
        df_temp = []
        id_temp = 1
        response = requests.get('http://news.daum.net/breakingnews/{}?page={}&regDate={}'
                                .format(category[5], page, REG_DATE))
        root = lxml.html.fromstring(response.content)
        for li in root.xpath('//*[@id="mArticle"]/div[3]/ul/li'):
            id_num = id_count + 15 * (page - 1) + id_temp  # 15 articles per list page
            id_temp = id_temp + 1
            id_str = str(id_num)
            id_fin = id_list[5] + "_" + id_str
            date = datetime.datetime.strptime(REG_DATE, '%Y%m%d')
            a = li.xpath('div/strong/a')[0]
            url = a.get('href')
            content = get_detail(url)
            content_len = float(len(content))
            news_id = url.split("/")[-1]
            comment = get_comment(news_id)
            comment_cnt = float(len(comment))
            if comment[0] == 'NA':
                comment_cnt = 0.0
            time = li.xpath('div/strong/span/span[2]')[0]  # publication time (shadows the time module)
            sentiment = get_sentiment(news_id)
            press = li.xpath('div/strong/span')[0]
            df = pd.DataFrame({'ID' : [id_fin], 'Date' : [date], 'Title' : [a.text], 'Content' : [content],
                               'Content_len' : [content_len], 'Comment' : [comment], 'Comment_cnt' : [comment_cnt],
                               'Time' : [time.text], 'Sentiment' : [sentiment],
                               'url' : [url], 'Press' : [press.text]})
            df_temp.append(df)
        if df_temp:
            for i in range(len(df_temp)):
                df_list.append(df_temp[i])
        # get the max page number from the pagination links
        for a in root.xpath('//*[@id="mArticle"]/div[3]/div/span/a'):
            try:
                num = int(a.text)
                if max_page < num:
                    max_page = num
            except (TypeError, ValueError):
                pass
        # check whether this is the last page
        span = root.xpath('//*[@id="mArticle"]/div[3]/div/span/a[@class="btn_page btn_next"]')
        # check
        print(page)
        if (len(span) <= 0) and (page >= max_page):
            break
        else:
            page = page + 1
        #time.sleep(1)
    id_count = len(df_list)
    # check
    print("======" + REG_DATE + "=========")

df_list_all = pd.concat(df_list)
with open('result/' + id_list[5] + '_2108.pickle', 'wb') as f:
    pickle.dump(df_list_all, f)
Loading the data back
with open('result/10500_2108.pickle', 'rb') as f:
    load_data = pickle.load(f)
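A quick sanity check on the reloaded frame, using the column names built in the loop above:

print(len(load_data))              # number of crawled articles
print(load_data.columns.tolist())
# ['ID', 'Date', 'Title', 'Content', 'Content_len', 'Comment',
#  'Comment_cnt', 'Time', 'Sentiment', 'url', 'Press']
load_data.head()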
Reference: https://jione-e.tistory.com/28