개발/ETC
네이버 뉴스 정보 가져오기
wonpick
2021. 5. 9. 19:48
첫 번째 방법
import pandas as pd
# Load the article list from a local CSV (UTF-8). The crawl loop further down
# reads this frame's 'Link' column to build the URL of each page to visit.
csv_test = pd.read_csv('C:/Users/user/m_6.csv', encoding='UTF-8')
# Bare expression: has no effect in a plain script; presumably left over from a
# notebook cell where it displays the loaded frame -- TODO confirm.
csv_test
from selenium import webdriver as wd
import time
import re
import csv
from selenium.common.exceptions import NoSuchElementException
def build_comment_url(link):
    """Return *link* with ``m_view=1&`` inserted at character offset 37.

    The offset comes from the original script. It presumably falls right
    after the ``?`` of a ``https://news.naver.com/main/read.nhn?`` link so
    that the comment view is opened -- TODO confirm against the CSV contents.
    """
    return link[:37] + 'm_view=1&' + link[37:]


def parse_comment_count(text):
    """Convert a comment-count label such as ``"1,234"`` to an int.

    Raises:
        ValueError: if *text* is not a number once the commas are removed.
    """
    return int(re.sub('[,]', '', text))


def disable_cleanbot(driver):
    """Switch off the comment filter ("cleanbot") on the loaded page.

    Some comments cannot be extracted while the filter is on, so it is
    disabled through its settings dialog before anything is scraped.

    Raises:
        NoSuchElementException: if any of the dialog controls is missing.
    """
    driver.find_element_by_css_selector('a.u_cbox_cleanbot_setbutton').click()
    time.sleep(1)
    driver.find_element_by_css_selector("#cleanbot_dialog_checkbox_cbox_module").click()
    time.sleep(1)
    driver.find_element_by_css_selector('body > div.u_cbox.u_cbox_layer_wrap > div > div.u_cbox_layer_cleanbot2 > div.u_cbox_layer_cleanbot2_extra > button').click()
    time.sleep(1)


def read_comment_count(driver):
    """Return the comment count shown in the comment box header.

    Tries the selector for the "sort by favorite" layout first and falls
    back to the generic one.

    Raises:
        NoSuchElementException: if the fallback selector matches nothing.
        ValueError: if the label text is not numeric.
    """
    # Imported here so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    try:
        c_cnt = driver.find_element_by_css_selector('#cbox_module > div.u_cbox_wrap.u_cbox_ko.u_cbox_type_sort_favorite > div.u_cbox_head > a > span.u_cbox_count').text
    except WebDriverException:
        c_cnt = driver.find_element_by_css_selector('#cbox_module > div > div.u_cbox_head > a > span.u_cbox_count').text
    return parse_comment_count(c_cnt)


def expand_all_comments(driver):
    """Click the "more" button until it can no longer be found or clicked.

    Only WebDriver errors end the loop, so Ctrl-C still interrupts the crawl
    (the original bare ``except`` swallowed it). The final failed lookup
    blocks for the driver's implicit-wait timeout before raising.
    """
    from selenium.common.exceptions import WebDriverException

    while True:
        try:
            driver.find_element_by_css_selector('a.u_cbox_btn_more').click()
        except WebDriverException:
            break


def collect_comment_rows(driver):
    """Return one CSV row tuple per comment currently rendered on the page.

    Row layout: (News_Date, News_Title, Content_Date, Content, Like, Dislike).
    Title and article time are read once instead of once per comment; a
    missing title or time element yields an empty string instead of an
    IndexError. Dates keep only their first 11 characters (presumably the
    ``YYYY.MM.DD.`` part -- TODO confirm).
    """
    article_head = driver.find_elements_by_css_selector('#articleTitle')
    article_time = driver.find_elements_by_css_selector('div.sponsor > span.t11')
    news_title = article_head[0].text if article_head else ''
    news_date = article_time[0].text[:11] if article_time else ''

    contents = driver.find_elements_by_css_selector('span.u_cbox_contents')
    c_dates = driver.find_elements_by_css_selector('span.u_cbox_date')
    likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_recomm')
    n_likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_unrecomm')
    return [
        (news_date, news_title, c_date.text[:11], content.text, like.text, n_like.text)
        for content, c_date, like, n_like in zip(contents, c_dates, likes, n_likes)
    ]


def scrape_article(url, writer):
    """Open *url* in a fresh Chrome, write its comments to *writer*, then quit.

    Articles without the expected comment widgets, or with a non-numeric
    count label, are skipped. The browser is always shut down with
    ``quit()``, which also ends the driver process that ``close()`` leaves
    running.
    """
    driver = wd.Chrome(executable_path="chromedriver.exe")
    try:
        driver.implicitly_wait(30)
        driver.get(url)
        disable_cleanbot(driver)
        i_cnt = read_comment_count(driver)
        if i_cnt == 0:
            return
        if i_cnt <= 20:
            # Short threads need no "more" clicks; just wait for rendering.
            time.sleep(3)
        else:
            expand_all_comments(driver)
        writer.writerows(collect_comment_rows(driver))
    except (NoSuchElementException, ValueError):
        # Page has no usable comment box: skip it and move on.
        return
    finally:
        driver.quit()


# The guard keeps the crawl from starting on import while still running it
# when the file is executed as a script or pasted into a notebook cell.
if __name__ == "__main__":
    csv_filename = "news_6월.csv"
    # `with` guarantees the file is flushed and closed even if the crawl dies.
    with open(csv_filename, "w+", newline='', encoding='utf-8') as csv_open:
        csv_writer = csv.writer(csv_open)
        # Header row.
        csv_writer.writerow(('News_Date', 'News_Title', 'Content_Date', 'Content', 'Like', 'Dislike', ))
        links = csv_test['Link']
        # Same upper bound as before, clamped so a shorter CSV cannot overrun.
        for n in range(0, min(10256, len(links))):
            url = build_comment_url(links[n])
            scrape_article(url, csv_writer)
두 번째 방법
import pandas as pd
from selenium import webdriver as wd
import time
import re
import csv
from selenium.common.exceptions import NoSuchElementException
def build_comment_url(link):
    """Return *link* with ``m_view=1&`` inserted at character offset 37.

    The offset comes from the original script. It presumably falls right
    after the ``?`` of a ``https://news.naver.com/main/read.nhn?`` link so
    that the comment view is opened -- TODO confirm against the CSV contents.
    """
    return link[:37] + 'm_view=1&' + link[37:]


def parse_comment_count(text):
    """Convert a comment-count label such as ``"1,234"`` to an int.

    Raises:
        ValueError: if *text* is not a number once the commas are removed.
    """
    return int(re.sub('[,]', '', text))


def read_comment_count_or_zero(driver):
    """Return the page's comment count, or 0 when no count can be read.

    Both header selectors are tried in order and whichever matches is
    parsed. The original never parsed the fallback selector's text, so
    articles matched only by the fallback were treated as having no comments.
    """
    # Imported here so the block does not depend on a new file-level import.
    from selenium.common.exceptions import WebDriverException

    selectors = (
        '#cbox_module > div.u_cbox_wrap.u_cbox_ko.u_cbox_type_sort_favorite > div.u_cbox_head > a > span.u_cbox_count',
        '#cbox_module > div > div.u_cbox_head > a > span.u_cbox_count',
    )
    for selector in selectors:
        try:
            return parse_comment_count(driver.find_element_by_css_selector(selector).text)
        except (WebDriverException, ValueError):
            continue
    # Neither selector produced a number; message kept from the original.
    print("fucking 연예기사")
    return 0


def turn_off_cleanbot(driver):
    """Switch off the comment filter ("cleanbot") on the loaded page.

    Some comments cannot be extracted while the filter is on. A failed click
    on the settings button is reported and ignored, as before.

    Raises:
        NoSuchElementException: if any of the dialog controls is missing.
    """
    from selenium.common.exceptions import WebDriverException

    cleanbot = driver.find_element_by_css_selector('a.u_cbox_cleanbot_setbutton')
    try:
        cleanbot.click()
    except WebDriverException:
        print("클린봇 버튼 클릭 오류")
    # implicitly_wait() sets the element-lookup timeout; it does not pause.
    # Setting it once is equivalent to the original's repeated calls.
    driver.implicitly_wait(10)
    driver.find_element_by_css_selector("#cleanbot_dialog_checkbox_cbox_module").click()
    driver.find_element_by_css_selector('body > div.u_cbox.u_cbox_layer_wrap > div > div.u_cbox_layer_cleanbot2 > div.u_cbox_layer_cleanbot2_extra > button').click()


def load_all_comments(driver, url, i_cnt):
    """Click the "more" button until it can no longer be found or clicked.

    The loop ends with a WebDriver error once the button is gone; that is
    the normal stop condition and is reported with the original message.
    Pauses one second before every tenth click.
    """
    from selenium.common.exceptions import WebDriverException

    scroll_cnt = 0
    while True:
        if scroll_cnt % 10 == 0:
            time.sleep(1)
        try:
            driver.find_element_by_css_selector('a.u_cbox_btn_more').click()
        except WebDriverException:
            print("더 이상 댓글 더 보기를 누를 수 없습니다 : "+url+"댓글 수 : "+str(i_cnt))
            break
        scroll_cnt += 1


def scrape_comment_records(driver, url, i_cnt):
    """Return one record dict per comment on the page loaded in *driver*.

    Keys: News_Date, News_Title, Content_Date, Content, Like, Dislike.
    A missing title or time element yields an empty string instead of an
    IndexError. Dates keep only their first 11 characters (presumably the
    ``YYYY.MM.DD.`` part -- TODO confirm).

    Raises:
        NoSuchElementException: if the cleanbot controls are missing.
    """
    heads = driver.find_elements_by_css_selector('#articleTitle')
    article_head = heads[0].text if heads else ''
    times = driver.find_elements_by_css_selector('div.sponsor > span.t11')
    article_time = times[0].text[:11] if times else ''

    turn_off_cleanbot(driver)
    if i_cnt > 20:
        # Longer threads are paginated behind a "more" button.
        load_all_comments(driver, url, i_cnt)

    contents = driver.find_elements_by_css_selector('span.u_cbox_contents')
    c_dates = driver.find_elements_by_css_selector('span.u_cbox_date')
    likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_recomm')
    n_likes = driver.find_elements_by_css_selector('em.u_cbox_cnt_unrecomm')
    return [
        {'News_Date': article_time, 'News_Title': article_head, 'Content_Date': c_date.text[:11], 'Content': content.text, 'Like': like.text, 'Dislike': n_like.text}
        for content, c_date, like, n_like in zip(contents, c_dates, likes, n_likes)
    ]


# The guard keeps the crawl from starting on import while still running it
# when the file is executed as a script or pasted into a notebook cell.
if __name__ == "__main__":
    start = time.time()
    csv_test = pd.read_csv('C:/Users/User/3월.csv', encoding='UTF-8')
    comment_columns = ['News_Date', 'News_Title', 'Content_Date', 'Content', 'Like', 'Dislike']
    # Result frame; starts empty and is rebuilt from `comment_records` below.
    day_article_comment = pd.DataFrame(columns=comment_columns)
    # Rows are gathered in a list and turned into a frame once.
    # DataFrame.append copied the frame per row and was removed in pandas 2.0.
    comment_records = []
    driver = wd.Chrome(executable_path="C://Users/User/Downloads/chromedriver_win32/chromedriver.exe")
    try:
        driver.maximize_window()
        links = csv_test['Link']
        # Same row window as before, clamped so a shorter CSV cannot overrun.
        for n in range(4336, min(11484, len(links))):
            url = build_comment_url(links[n])
            driver.get(url)
            # Reset per page: the cleanbot step lowers the timeout to 10 s.
            driver.implicitly_wait(15)
            i_cnt = read_comment_count_or_zero(driver)
            if i_cnt == 0:
                continue
            try:
                comment_records.extend(scrape_comment_records(driver, url, i_cnt))
            except NoSuchElementException:
                continue
            print(str(n)+"번째 글 불러오기 완료")
    finally:
        # Keep whatever was collected even if the crawl is interrupted, then
        # shut the browser and its driver process down.
        day_article_comment = pd.DataFrame(comment_records, columns=comment_columns)
        driver.quit()
    elapsed_time = time.time() - start
    print(elapsed_time)