CSS selecting with BeautifulSoup (BS)
import requests
from selenium import webdriver
from bs4 import BeautifulSoup as bs
import pandas as pd
import time
driver = webdriver.Chrome("C:/Users/user/anaconda3/chromedriver")  # launch Chrome through the local chromedriver (Selenium 3-style path argument)
driver.implicitly_wait(3)  # wait up to 3 seconds for elements to appear
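Passing the chromedriver path positionally is the Selenium 3 style used throughout this post. As a sketch, the equivalent Selenium 4 setup would be:

from selenium.webdriver.chrome.service import Service
driver = webdriver.Chrome(service=Service("C:/Users/user/anaconda3/chromedriver"))  # Selenium 4+ style

(Selenium 4 also drops the find_element_by_* helpers in favor of find_element(By.XPATH, ...), so the code below assumes Selenium 3.)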
driver.get('https://m.search.naver.com/search.naver?where=m_article&sm=mtb_viw.all&query=%EC%BD%94%EB%A1%9C%EB%82%98&nso=')  # mobile cafe-post search for the query '코로나'
driver.find_element_by_xpath('//*[@id="addParemt"]/li[1]/div[1]/a').click()  # open the first search result
title = driver.find_elements_by_css_selector("#ct > div:nth-child(1) > div > h2")  # post title
print(title[0].text)
post = driver.find_elements_by_css_selector("#postContent")  # post body
print(post[0].text)
reply_num = driver.find_elements_by_css_selector('#app > div > div > div.ArticleContentWrap > div.footer_fix > div > a.f_reply > em')  # reply count
print(reply_num[0].text)
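One caveat: find_elements_* returns an empty list on a miss, so title[0] raises IndexError when the selector does not match. A minimal defensive sketch (the helper name first_text is my own):

def first_text(elements, default=""):
    # text of the first matched element, or a default when nothing matched
    return elements[0].text if elements else default

print(first_text(title))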
Error: "Only the following pseudo-classes are implemented: nth-of-type." BeautifulSoup's select() raises this when a selector copied from DevTools uses :nth-child, so change :nth-child to :nth-of-type.
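A minimal sketch of the fix, reusing the title selector from above with BeautifulSoup (parsing the Selenium driver's page source):

soup = bs(driver.page_source, "html.parser")
title_tags = soup.select("#ct > div:nth-of-type(1) > div > h2")  # was div:nth-child(1)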
driver.find_element_by_xpath('//*[@id="app"]/div/div/div[2]/div[2]/div/a[1]/em').click()  # click the reply counter to open the comment list
Infinite scroll
# Naver Cafe search results are capped at 1,000 posts, so fewer than 100 scroll-downs are enough
pagedowns = 1
while pagedowns < 68:  # mobile loads 15 posts per scroll, web 10: 1000 / 15 ≈ 67 scrolls
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)
    pagedowns += 1
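A fixed count is fragile if Naver changes how many posts load per scroll. An alternative sketch scrolls until the page height stops growing:

last_height = driver.execute_script("return document.body.scrollHeight")
while True:
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(2)  # give the next batch of posts time to load
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:  # nothing new loaded; stop
        break
    last_height = new_height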
Extracting links
# Extract the post links -> but for Naver Cafe this turns out to be useless.
html_source = driver.page_source  # the fully scrolled page as rendered by Selenium
# url = "https://m.search.naver.com/search.naver?where=m_article&sm=mtb_viw.all&query=%EC%BD%94%EB%A1%9C%EB%82%98&nso="
# result = requests.get(url)  # unused: requests only sees the initial page, not the scrolled-in results
bs_obj = bs(html_source, "html.parser")
# If_items = bs_obj.find_all("div", {'class': 'total_wrap'})
reply = bs_obj.select(".total_wrap > .total_sub > a")  # '.' selects by class
for a in reply:
    href = a.attrs['href']
    print(href)
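Since pandas is already imported, a short sketch that collects the hrefs into a DataFrame and saves them (the CSV file name is an arbitrary choice):

links = [a.attrs['href'] for a in reply]
df = pd.DataFrame({'href': links})
df.to_csv('cafe_links.csv', index=False, encoding='utf-8-sig')  # utf-8-sig so Excel displays Hangul correctly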