
Fetching Naver News Data

wonpick 2021. 5. 9. 19:48

First method

Open a new browser instance for every article and write each comment row straight to a CSV file.

import pandas as pd

# CSV of article links collected beforehand; it must contain a 'Link' column
csv_test = pd.read_csv('C:/Users/user/m_6.csv', encoding='UTF-8')
csv_test
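In both methods below, each link is spliced at a hard-coded offset: for links of the form https://news.naver.com/main/read.nhn?..., index 37 is the character right after the '?', so 'm_view=1&' lands at the front of the query string and the page opens with the comment view expanded. A more defensive sketch using urllib.parse (assuming the links follow that read.nhn format):

from urllib.parse import urlsplit, urlunsplit

def add_comment_view(link):
    # prepend m_view=1 to the query string instead of slicing at a fixed
    # index, so the link survives any change in path length
    parts = urlsplit(link)
    return urlunsplit(parts._replace(query='m_view=1&' + parts.query))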

from selenium import webdriver as wd
import time
import re
import csv
from selenium.common.exceptions import NoSuchElementException

csv_filename = "news_6월.csv"
csv_open = open(csv_filename, "w+", newline='', encoding='utf-8')
csv_writer = csv.writer(csv_open)
csv_writer.writerow( ('News_Date', 'News_Title', 'Content_Date', 'Content', 'Like', 'Dislike', ) )  # write the header row


for n in range(0,10256):

    # insert 'm_view=1&' right after the '?' (index 37) to open the comment view
    url = csv_test['Link'][n][:37]+'m_view=1&'+csv_test['Link'][n][37:]

    # web driver - a new browser instance is launched for every article
    driver = wd.Chrome(executable_path="chromedriver.exe")
    driver.implicitly_wait(30)
    driver.get(url)

    try:
        # Naver's Cleanbot hides some comments and blocks extraction,
        # so disable the Cleanbot option before extracting
        cleanbot = driver.find_element_by_css_selector('a.u_cbox_cleanbot_setbutton')
        cleanbot.click()
        time.sleep(1)
        cleanbot_disable = driver.find_element_by_css_selector("#cleanbot_dialog_checkbox_cbox_module")
        cleanbot_disable.click()
        time.sleep(1)
        cleanbot_confirm = driver.find_element_by_css_selector('body > div.u_cbox.u_cbox_layer_wrap > div > div.u_cbox_layer_cleanbot2 > div.u_cbox_layer_cleanbot2_extra > button')
        cleanbot_confirm.click()
        time.sleep(1)

        try:
            c_cnt = driver.find_element_by_css_selector('#cbox_module > div.u_cbox_wrap.u_cbox_ko.u_cbox_type_sort_favorite > div.u_cbox_head > a > span.u_cbox_count').text
        except:
            c_cnt = driver.find_element_by_css_selector('#cbox_module > div > div.u_cbox_head > a > span.u_cbox_count').text


        i_cnt = int(re.sub('[,]','',c_cnt))

        if i_cnt == 0 :
            # no comments - just close this article's browser
            driver.close()

        elif i_cnt <= 20 :
            time.sleep(3)
            # extract the article title
            article_head = driver.find_elements_by_css_selector('#articleTitle')
            #print("Article title : " + article_head[0].text)

            # extract the article timestamp
            article_time = driver.find_elements_by_css_selector('div.sponsor > span.t11')
            #print("Article published : " + article_time[0].text)

            # extract comments
            contents = driver.find_elements_by_css_selector('span.u_cbox_contents')
            c_dates = driver.find_elements_by_css_selector('span.u_cbox_date')
            likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_recomm')
            n_likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_unrecomm')
            for content, c_date, like, n_like in zip(contents, c_dates, likes, n_likes):
                #print(cnt, " : ", content.text, c_date.text,like.text, n_like.text)
                csv_writer.writerow ( (article_time[0].text[:11], article_head[0].text, c_date.text[:11], content.text, like.text, n_like.text, ) )
            driver.close()

        else :
            # keep clicking the "more comments" button until it disappears
            while True:
                try:
                    btn_more = driver.find_element_by_css_selector('a.u_cbox_btn_more')
                    btn_more.click()
                    # time.sleep(1)
                except:
                    break
            # extract the article title
            article_head = driver.find_elements_by_css_selector('#articleTitle')
            #print("Article title : " + article_head[0].text)

            # extract the article timestamp
            article_time = driver.find_elements_by_css_selector('div.sponsor > span.t11')
            #print("Article published : " + article_time[0].text)

            # extract comments
            contents = driver.find_elements_by_css_selector('span.u_cbox_contents')
            c_dates = driver.find_elements_by_css_selector('span.u_cbox_date')
            likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_recomm')
            n_likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_unrecomm')
            for content, c_date, like, n_like in zip(contents, c_dates, likes, n_likes):
                #print(cnt, " : ", content.text, c_date.text,like.text, n_like.text)
                csv_writer.writerow ( (article_time[0].text[:11], article_head[0].text, c_date.text[:11], content.text, like.text, n_like.text) )
            driver.close()

    except NoSuchElementException:
        driver.close()


csv_open.close()
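Side note: the lookups above (find_element_by_css_selector, the executable_path argument) use the Selenium 3 API, which was removed in Selenium 4. On a current install the equivalent calls look roughly like this (a minimal sketch, assuming selenium >= 4):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

driver = webdriver.Chrome(service=Service("chromedriver.exe"))
# element lookups take a By locator plus the same CSS selectors as above
cleanbot = driver.find_element(By.CSS_SELECTOR, 'a.u_cbox_cleanbot_setbutton')
contents = driver.find_elements(By.CSS_SELECTOR, 'span.u_cbox_contents')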

Second method

Reuse a single browser across all articles and accumulate the rows in a pandas DataFrame instead of writing to CSV as you go.

import pandas as pd
from selenium import webdriver as wd
import time
import re
import csv
from selenium.common.exceptions import NoSuchElementException

start = time.time()

csv_test = pd.read_csv('C:/Users/User/3월.csv', encoding='UTF-8')
# create an empty DataFrame, day_article_comment, to collect the results
day_article_comment=pd.DataFrame(columns = ['News_Date', 'News_Title', 'Content_Date', 'Content', 'Like', 'Dislike'])
driver = wd.Chrome(executable_path="C://Users/User/Downloads/chromedriver_win32/chromedriver.exe")
driver.maximize_window()

for n in range(4336,11484):
    # insert 'm_view=1&' right after the '?' (index 37) to open the comment view
    url = csv_test['Link'][n][:37]+'m_view=1&'+csv_test['Link'][n][37:]

    driver.get(url)
    driver.implicitly_wait(15)
    #print("성공-"+url)
    i_cnt = 0
    try:
        try:
            c_cnt = driver.find_element_by_css_selector('#cbox_module > div.u_cbox_wrap.u_cbox_ko.u_cbox_type_sort_favorite > div.u_cbox_head > a > span.u_cbox_count').text
        except:
            c_cnt = driver.find_element_by_css_selector('#cbox_module > div > div.u_cbox_head > a > span.u_cbox_count').text
        i_cnt = int(re.sub('[,]','',c_cnt))
    except:
        # entertainment articles use a different comment module, so neither selector matches
        print("entertainment article - comment count not found: "+url)

    if i_cnt == 0 :
        # no comments (or the count could not be read) - skip this article
        continue

    # comments exist

    else:
        # extract the article title
        article_head = driver.find_elements_by_css_selector('#articleTitle')
        article_head=article_head[0].text

        # extract the article timestamp (date portion only)
        article_time = driver.find_elements_by_css_selector('div.sponsor > span.t11')
        article_time = article_time[0].text[:11]

        try:
            # Naver's Cleanbot hides some comments and blocks extraction,
            # so disable the Cleanbot option before extracting
            cleanbot = driver.find_element_by_css_selector('a.u_cbox_cleanbot_setbutton')
            try:
                cleanbot.click()
            except:
                print("Cleanbot button click error")
            driver.implicitly_wait(10)
            cleanbot_disable = driver.find_element_by_css_selector("#cleanbot_dialog_checkbox_cbox_module")
            cleanbot_disable.click()
            driver.implicitly_wait(10)
            cleanbot_confirm = driver.find_element_by_css_selector('body > div.u_cbox.u_cbox_layer_wrap > div > div.u_cbox_layer_cleanbot2 > div.u_cbox_layer_cleanbot2_extra > button')
            cleanbot_confirm.click()
            driver.implicitly_wait(10)

            if i_cnt > 20 :
                # keep clicking the "more comments" button until it disappears
                scroll_cnt = 0
                while True:
                    try:
                        # pause briefly after every 10 loads
                        if scroll_cnt%10==0:
                            time.sleep(1)
                        btn_more = driver.find_element_by_css_selector('a.u_cbox_btn_more')
                        btn_more.click()
                        driver.implicitly_wait(10)
                        scroll_cnt+=1
                    except:
                        # the "more" button disappears once every comment is loaded,
                        # so the lookup fails and we stop clicking
                        print("No more comments to load: "+url+" / comment count: "+str(i_cnt))
                        #driver.close()
                        #driver = wd.Chrome(executable_path="C://Users/User/Downloads/chromedriver_win32/chromedriver.exe")
                        #driver.get(url)
                        break

            # extract comments (whether or not "more" had to be clicked)
            contents = driver.find_elements_by_css_selector('span.u_cbox_contents')
            c_dates = driver.find_elements_by_css_selector('span.u_cbox_date')
            likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_recomm')
            n_likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_unrecomm')

            for content, c_date, like, n_like in zip(contents, c_dates, likes, n_likes):
                # read each element's text directly rather than reassigning the
                # list names (contents, likes, ...) inside the loop
                day_article_comment=day_article_comment.append({'News_Date':article_time, 'News_Title':article_head, 'Content_Date':c_date.text[:11], 'Content':content.text, 'Like':like.text, 'Dislike':n_like.text}, ignore_index=True)
            print("Finished loading article #"+str(n))

        except NoSuchElementException:
            continue
elapsed_time = time.time() - start
print(elapsed_time)  # total run time in seconds
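The second method accumulates everything in day_article_comment but never writes it to disk, so persist it at the end. Note also that DataFrame.append was removed in pandas 2.0; on newer pandas, collect the rows in a plain list and build the frame once. A sketch of both, with a hypothetical output filename:

# save the collected comments; utf-8-sig keeps Hangul readable in Excel
day_article_comment.to_csv('news_comments.csv', index=False, encoding='utf-8-sig')

# pandas >= 2.0 replacement for DataFrame.append: accumulate dicts, build once
rows = []
# ... inside the loop: rows.append({'News_Date': article_time, ...}) ...
day_article_comment = pd.DataFrame(rows, columns=['News_Date', 'News_Title', 'Content_Date', 'Content', 'Like', 'Dislike'])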