
[python] Daum news crawling (python crawling)

by clolee 2021. 9. 12.
import requests
import lxml.html
import pandas as pd
from pandas.io import sql
import os
import time
import datetime

 

Date list needed for the news URLs

date_index = pd.date_range(start='20210801', end='20210803')
date_list = date_index.strftime("%Y%m%d").tolist()
date_list

category = ['politics', 'economic', 'society',  'culture', 'foreign', 'digital']
id_list = ['10000', '10100', '10200', '10300', '10400', '10500']
category_id = {'politics' : '10000', 'economic' : '10100', 'society' : '10200', 'culture' : '10300', 'foreign' : '10400', 'digital' : '10500'}
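
For reference, each category and regDate pair is what gets plugged into Daum's breaking-news listing URL in the crawl loop further down; a minimal check:

# listing URL for one category and one date (same pattern as the crawl loop below)
cat = category[5]   # 'digital'
list_url = 'http://news.daum.net/breakingnews/{}?page={}&regDate={}'.format(cat, 1, date_list[0])
print(list_url)     # http://news.daum.net/breakingnews/digital?page=1&regDate=20210801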

 

Crawling the news article body

import re
import string

def get_detail(url):
    body = []
    # punctuation to strip from the article text
    punc = r'[!"#$%&\'()*+\-./:;<=>?\[\]^_`{|}~“”·]'
    # reporter e-mail address at the end of a paragraph
    reg = re.compile(r'[a-zA-Z0-9+\-_.]+@[a-zA-Z0-9\-]+[a-zA-Z0-9\-.]+$')
    response = requests.get(url)
    root = lxml.html.fromstring(response.content)
    # article paragraphs sit under Daum's harmonyContainer section
    for p in root.xpath('//*[@id="harmonyContainer"]/section/p'):
        if p.text:  # skip empty <p> tags
            temp = re.sub(punc, '', p.text)  # remove punctuation
            temp = re.sub(reg, '', temp)     # remove e-mail addresses
            body.append(temp)
    full_body = ' '.join(body)
    
    return full_body
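
A quick usage check; the article URL here is only a placeholder, since in the actual crawl the URLs are pulled from the listing pages below:

# placeholder article URL (hypothetical) - real URLs come from the listing pages
sample_url = 'https://news.v.daum.net/v/20210801000000000'
print(get_detail(sample_url)[:100])   # first 100 characters of the cleaned body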

 

Crawling the news comments

The authentication key that goes in 'Authorization' : has to be copied in fresh from time to time, since the token expires.
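
Since the same Bearer token has to be pasted into both the comment request and the reaction request below, one convenience (my own suggestion, not part of the original code) is to keep it in a single variable or environment variable so only one place needs updating:

import os

# paste the freshly copied token here (or set the DAUM_TOKEN environment variable)
DAUM_AUTH_TOKEN = os.environ.get('DAUM_TOKEN', 'Bearer <paste current token>')

def auth_headers():
    # shared Authorization header for the comment / reaction API calls
    return {'Authorization': DAUM_AUTH_TOKEN}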

 

I tried to fetch all of the comments, but could not get more than 100 at a time.

import requests
from bs4 import BeautifulSoup

def get_comment(news_id):
    list_comment = []
    url = 'https://comment.daum.net/apis/v1/posts/@{}/comments?'.format(news_id)
    # the comment API returned at most 100 comments per request for me
    params = {'parentId' : '0', 'offset' : '0', 'limit' : '100', 'sort' : 'RECOMMEND', 'isInitial' : 'true'}
    headers = {'Authorization' : 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJmb3J1bV9rZXkiOiJuZXdzIiwiZ3JhbnRfdHlwZSI6ImFsZXhfY3JlZGVudGlhbHMiLCJzY29wZSI6W10sImV4cCI6MTYzMTQ0OTE1OSwiYXV0aG9yaXRpZXMiOlsiUk9MRV9DTElFTlQiXSwianRpIjoiMmZhZDYzMzYtYzgyOC00Mzk3LWFhYzEtNjhjYzQ4YzUzMTdmIiwiZm9ydW1faWQiOi05OSwiY2xpZW50X2lkIjoiMjZCWEF2S255NVdGNVowOWxyNWs3N1k4In0.X9wnyiofD-33QI4WpEuapV-onY2siDWGJChcw_tkxwU'
    }
    response = requests.get(url, headers = headers, params = params)
    status_code = response.status_code
    comment_all = response.json()
    #print(response.json())
    #print(count_all['commentCount'])
    # keep [content, like count, dislike count] for each comment
    for i in comment_all:
        li = []
        li.append(i['content'])
        li.append(float(i['likeCount']))
        li.append(float(i['dislikeCount']))
        list_comment.append(li)
        #print(li)
        #print("----")
    
    # no comments at all: mark with 'NA'
    if len(list_comment) == 0:
        list_comment.append('NA')

    return list_comment
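
I could not get more than 100 comments in one request, but if the endpoint does honor larger offset values, paging through them might look roughly like this (an untested sketch that reuses the same URL and params as get_comment; the headers argument is the same Authorization header as above):

def get_all_comments(news_id, headers, page_size=100):
    # untested sketch: keep requesting the next 'offset' until an empty batch comes back
    url = 'https://comment.daum.net/apis/v1/posts/@{}/comments?'.format(news_id)
    all_comments = []
    offset = 0
    while True:
        params = {'parentId': '0', 'offset': str(offset), 'limit': str(page_size), 'sort': 'RECOMMEND'}
        batch = requests.get(url, headers=headers, params=params).json()
        if not isinstance(batch, list) or not batch:
            break  # error response or no more comments
        all_comments.extend(batch)
        offset += page_size
    return all_comments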

 

Crawling the emotion (reaction) counts for each news article

As before, the authentication key that goes in 'Authorization' : has to be copied in fresh from time to time.

 

I got a teammate's help with this part; it is crawled the same way as the comments.

import requests
import json

def get_sentiment(news_id):
    
    url = 'https://action.daum.net/apis/v1/reactions/home?itemKey={}'.format(news_id)
    header = {
        "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "referer": url,
        'Authorization' : "Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJmb3J1bV9rZXkiOiJuZXdzIiwiZ3JhbnRfdHlwZSI6ImFsZXhfY3JlZGVudGlhbHMiLCJzY29wZSI6W10sImV4cCI6MTYzMTQ0OTE1OSwiYXV0aG9yaXRpZXMiOlsiUk9MRV9DTElFTlQiXSwianRpIjoiMmZhZDYzMzYtYzgyOC00Mzk3LWFhYzEtNjhjYzQ4YzUzMTdmIiwiZm9ydW1faWQiOi05OSwiY2xpZW50X2lkIjoiMjZCWEF2S255NVdGNVowOWxyNWs3N1k4In0.X9wnyiofD-33QI4WpEuapV-onY2siDWGJChcw_tkxwU"
    }
    raw = requests.get(url, headers=header)

    s_jsonData = json.loads(raw.text)

    # map Daum's reaction counts to their Korean labels
    sentiment = {"좋아요" : 0, "감동이에요" : 0, "슬퍼요" : 0, "화나요" : 0, "추천해요" : 0}

    sentiment['좋아요'] = s_jsonData['item']['stats']['LIKE']
    sentiment['감동이에요'] = s_jsonData['item']['stats']['IMPRESS']
    sentiment['슬퍼요'] = s_jsonData['item']['stats']['SAD']
    sentiment['화나요'] = s_jsonData['item']['stats']['ANGRY']
    sentiment['추천해요'] = s_jsonData['item']['stats']['RECOMMEND']
    
    return sentiment
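
Both helpers can be spot-checked on a single article; the news_id below is a placeholder, since in the crawl loop it is taken from the article URL:

# placeholder news_id (hypothetical) - in the crawl loop it comes from url.split("/")[-1]
sample_id = '20210801000000000'
print(get_comment(sample_id))    # list of [content, likeCount, dislikeCount], or ['NA']
print(get_sentiment(sample_id))  # dict like {'좋아요': ..., '감동이에요': ..., '슬퍼요': ..., '화나요': ..., '추천해요': ...}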

 

Run the crawl and save the full DataFrame with pickle

import pickle

df_list = []
id_count = 0
for REG_DATE in date_list:
    page = 1
    max_page = 0
    while(True):
        df_temp = []
        id_temp = 1
        # breaking-news listing page for the chosen category (category[5] = 'digital')
        response = requests.get('http://news.daum.net/breakingnews/{}?page={}&regDate={}'\
                                .format(category[5], page, REG_DATE))
        root = lxml.html.fromstring(response.content)
        # each listing page shows up to 15 articles
        for li in root.xpath('//*[@id="mArticle"]/div[3]/ul/li'):
            id_num = id_count + 15 * (page - 1) + id_temp
            id_temp = id_temp + 1
            id_str = str(id_num)
            id_fin = id_list[5]+"_"+id_str  # e.g. '10500_1'
            date = datetime.datetime.strptime(REG_DATE, '%Y%m%d')
            a = li.xpath('div/strong/a')[0]
            url = a.get('href')
            content = get_detail(url)
            content_len = float(len(content))
            
            news_id = url.split("/")[-1]  # article id = last path segment of the article URL
            comment = get_comment(news_id)
            comment_cnt = float(len(comment))
            if comment[0] == 'NA':  # no comments
                comment_cnt = 0.0
                
            time_posted = li.xpath('div/strong/span/span[2]')[0]  # renamed so it no longer shadows the time module
            sentiment = get_sentiment(news_id)
            press = li.xpath('div/strong/span')[0]
            df = pd.DataFrame({'ID' : [id_fin], 'Date' : [date], 'Title':[a.text], 'Content' : [content], 'Content_len' : [content_len], 
                               'Comment' : [comment], 'Comment_cnt' : [comment_cnt], 'Time' : [time_posted.text], 'Sentiment' : [sentiment],
                               'url' : [url], 'Press' : [press.text]})
            df_temp.append(df)   
    
        if df_temp:
            df_list.extend(df_temp)

        # find the largest page number among the pagination links
        for a in root.xpath('//*[@id="mArticle"]/div[3]/div/span/a'):
            try:
                num = int(a.text)
                if max_page < num:
                    max_page = num
            except (TypeError, ValueError):
                pass

        # stop when there is no "next page" button and the last page number has been reached
        span = root.xpath('//*[@id="mArticle"]/div[3]/div/span/a[@class="btn_page btn_next"]')
        # progress check
        print(page)
        if len(span) <= 0 and page >= max_page:
            break
        else:
            page = page + 1

        #time.sleep(1)
    id_count = len(df_list)
    # progress check
    print("======"+REG_DATE+"=========")

df_list_all = pd.concat(df_list)

with open('result/'+id_list[5]+'_2108.pickle', 'wb') as f:
    pickle.dump(df_list_all, f)

 

Loading the data back

with open('result/10500_2108.pickle', 'rb') as f:
    load_data = pickle.load(f)
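
A quick sanity check that the reload worked:

print(type(load_data))    # <class 'pandas.core.frame.DataFrame'>
print(load_data.shape)    # (number of crawled articles, 11 columns)
load_data.head()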

 

 

 

References:

https://jione-e.tistory.com/28

https://jvvp.tistory.com/1146?category=881259

https://teddylee777.github.io/python/pickle%EB%A1%9C-%EB%8D%B0%EC%9D%B4%ED%84%B0-dataframe-%EC%A0%80%EC%9E%A5%ED%95%98%EA%B3%A0-%EB%B6%88%EB%9F%AC%EC%98%A4%EA%B8%B0
