In [51]:
headers = {'user-agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537'}

def getDownload(url, param=None, retries=3):
    resp = None
    try:
        resp = requests.get(url, params=param, headers=headers)
        resp.raise_for_status()
    except requests.exceptions.HTTPError as e:
        if 500 <= resp.status_code < 600 and retries > 0:
            print('Retries : {0}'.format(retries))
            return getDownload(url, param, retries - 1)
        else:
            print(resp.status_code)
            print(resp.reason)
            print(resp.request.headers)
    return resp
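As a quick sanity check of the retry path (a sketch added here, not part of the original notebook), calling getDownload against an endpoint that always answers 500, such as https://httpbin.org/status/500, should print three "Retries" messages before the failed response comes back:

resp = getDownload('https://httpbin.org/status/500')
# expected output: Retries : 3, Retries : 2, Retries : 1,
# then the final status code, reason, and request headers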
In [52]:
import requests
from bs4 import BeautifulSoup
In [62]:
import requests
url = "https://search.daum.net/nate"
param = {
    "thr": "sbma",
    "w": "tot",
    "q": "%ED%8C%8C%EC%9D%B4%EC%8D%AC"}
html = getDownload(url,param)
dom = BeautifulSoup(html.content,"lxml")
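Note that requests percent-encodes parameter values on its own, so the already-encoded query string above gets encoded a second time when the request is built. Passing the plain-text query is usually the safer form (a small variation on the cell above, not the original code):

param = {"thr": "sbma", "w": "tot", "q": "파이썬"}  # requests handles the URL encoding
html = getDownload(url, param)
dom = BeautifulSoup(html.content, "lxml")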
In [179]:
for tag in dom.select('div#blogColl a.wrap_tit + span'):
    print(tag.text)
    if tag.has_attr('href'):
        print(tag['href'])
In [72]:
url ='https://www.google.com/search?q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&oq=%ED%8C%8C%EC%9D%B4%EC%8D%AC&aqs=chrome..69i57j35i39j69i60j69i65j69i60l2.1587j0j4&sourceid=chrome&ie=UTF-8'
html = getDownload(url,{})
In [73]:
dom = BeautifulSoup(html.text,'lxml')
In [74]:
len(dom.select(' .r > a'))
Out[74]:
In [75]:
for tag in dom.select(".r > a > h3"):
print(tag.text)
print(tag.find_parent()['href'])
In [85]:
url = 'https://search.naver.com/search.naver?sm=top_hty&fbm=1&ie=utf8&query=%ED%8C%8C%EC%9D%B4%EC%8D%AC'
html = getDownload(url)
dom = BeautifulSoup(html.text,'lxml')
In [95]:
len(dom.select(' .blog dt > a '))
Out[95]:
In [96]:
for tag in dom.select(" .blog dt > a "):
print(tag.text)
print(tag['href'])
In [122]:
seed = 'http://example.webscraping.com/places/default/index'
html = getDownload(seed)
dom = BeautifulSoup(html.text,'lxml')
In [141]:
# inspect the dom
In [124]:
# the first page has 16 links
len(dom.select('a'))
Out[124]:
In [125]:
from urllib.parse import urljoin
requests.compat.urljoin(seed,'/search')
# urljoin(seed, "/search")
Out[125]:
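For reference, urljoin resolves an absolute path against the host, resolves a relative path against the directory of the base URL, and lets an absolute URL replace the base entirely (a small illustrative sketch; the relative path is just an example):

from urllib.parse import urljoin

base = 'http://example.webscraping.com/places/default/index'
urljoin(base, '/search')                 # -> 'http://example.webscraping.com/search'
urljoin(base, 'view/Korea-1')            # -> 'http://example.webscraping.com/places/default/view/Korea-1'
urljoin(base, 'http://other.example/x')  # -> 'http://other.example/x'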
In [129]:
# on the first page, links that resolve to the seed itself should not be collected
unseen = []
for tag in dom.select('a'):
    if tag.has_attr('href'):
        href = tag['href']
        if href.startswith('http'):  # external HTTP(S) link
            print("External : {0}".format(href))
        elif href.startswith("/"):   # internal path
            newSeed = requests.compat.urljoin(seed, href)
            if seed != newSeed:
                unseen.append(newSeed)
        else:
            print("Skipped: {0}".format(href))
# analyze only when no error occurs
# all of them are internal links
In [130]:
unseen
Out[130]:
In [150]:
# turn it into a function
def getUrls(base):
    # request the base address, take the response -> extract <a> tags -> normalize href
    # -> manage the results through a DB/list
    unseen = []
    # getDownload sends the request
    html = getDownload(base)
    dom = BeautifulSoup(html.text, 'lxml')
    for tag in dom.select('a'):
        if tag.has_attr('href'):
            href = tag['href']
            if href.startswith('http'):  # external HTTP(S) link
                # print("External : {0}".format(href))
                unseen.append(href)
            elif href.startswith("/"):   # internal path, resolved against base (not the global seed)
                newSeed = requests.compat.urljoin(base, href)
                if base != newSeed:
                    unseen.append(newSeed)
            # else:
            #     print("Skipped: {0}".format(href))
    print("{0} -> {1}".format(base, len(unseen)))
    return unseen
# analyze only when no error occurs
# all of them are internal links
In [146]:
queue = getUrls(seed)
seen = []
while queue:
    seed = queue.pop(0)  # pop() normally removes the last element; pop(0) takes the first
    time.sleep(random.randint(1, 3))
    unseen = getUrls(seed)
    seen.append(seed)
    # delay: makes the crawler behave more like a human user
    # a link must not already be in the queue or among the links already seen
    print("Q : {0}, Unseen : {1}".format(len(queue), len(unseen)))
    queue.extend([link for link in unseen if link not in seen and link not in queue])
# FIFO: the addresses are explored one at a time
# the queue grows to roughly 500 entries and then shrinks back down
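list.pop(0) has to shift every remaining element, so for a long queue collections.deque is the usual choice; the same loop with a deque would look roughly like this (an alternative sketch, not the original cell):

from collections import deque

queue = deque(getUrls(seed))
seen = set()
while queue:
    current = queue.popleft()          # FIFO, same order as pop(0) but O(1)
    time.sleep(random.randint(1, 3))   # polite delay between requests
    seen.add(current)
    for link in getUrls(current):
        if link not in seen and link not in queue:
            queue.append(link)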
In [137]:
# delay
import time
import random
time.sleep(random.randint(1, 3))  # sleeps for that many seconds
In [155]:
# attach the new links with extend rather than append
# extend adds the individual elements, not the list as a single item
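A tiny illustration of the difference:

a = [1, 2]
a.append([3, 4])   # a == [1, 2, [3, 4]]  -- the whole list becomes one element
b = [1, 2]
b.extend([3, 4])   # b == [1, 2, 3, 4]    -- the elements are added one by one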
In [ ]:
queue = ['https://www.google.com/search?q=%ED%8C%8C%EC%9D%B4%EC%8D%AC&oq=%ED%8C%8C%EC%9D%B4%EC%8D%AC&aqs=chrome..69i57j69i60j35i39j0l3.1994j0j8&sourceid=chrome&ie=UTF-8']
while queue:
    base = queue.pop(0)
    links = getUrls(base)
    queue.extend(links)
In [170]:
# collect the links on the first result page
# this cell runs first
url = 'https://www.google.com/search'
param = {'q':'파이썬'}
queue = []
html = getDownload(url, param)
dom = BeautifulSoup(html.text, 'lxml')
# '.' selects by class
for tag in dom.select(".r > a > h3"):
    print(tag.text)
    print(tag.find_parent()['href'])
    queue.append({"url": tag.find_parent()['href'], "depth": 0})
In [171]:
# final version
def getUrls(link, depth=3):
    if depth > 3:
        return None
    # for link in queue:
    links = []
    html = getDownload(link)
    dom = BeautifulSoup(html.text, 'lxml')
    for a in dom.select('a'):
        if a.has_attr('href'):  # make sure the attribute exists
            if a['href'].startswith('http'):
                links.append({"url": a['href'], "depth": depth + 1})
            elif a['href'].startswith('/') and len(a['href']) > 1:
                links.append({"url": requests.compat.urljoin(link, a['href']), 'depth': depth + 1})
            # else:
            #     print("Skipped : {0}".format(a['href']))
    print("{0} {1} : {2}".format(">" * depth, link, len(links)))
    return links
# hrefs that contain only '/' and hrefs that start with 'javascript:' all need to be filtered out
# hrefs matching /(.+) or starting with http(s) should be kept
# the printed number is the count of links found inside each link
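One way to express that filtering rule is a small predicate (a hypothetical helper, isCrawlable, added here for illustration):

import re

def isCrawlable(href):
    # keep absolute http(s) links and internal paths like /something,
    # drop bare '/', protocol-relative '//...', and javascript: pseudo-links
    if re.match(r'https?://', href):
        return True
    if re.match(r'/.+', href) and not href.startswith('//'):
        return True
    return False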
In [174]:
depth = 0
while queue:
    link = queue.pop(0)
    links = getUrls(link['url'], link['depth'])
    if links is not None:
        queue.extend(links)
# each run prints the depth markers, the seed URL, and the number of URLs inside it
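The loop above can fetch the same URL many times at different depths; keeping a visited set (an addition not in the original cell) keeps the queue from exploding:

visited = set()
while queue:
    link = queue.pop(0)
    if link['url'] in visited or link['depth'] > 3:
        continue
    visited.add(link['url'])
    links = getUrls(link['url'], link['depth'])
    if links is not None:
        queue.extend(links)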
In [178]:
# how pop works
a = [1,2,3,4]
b = a.pop(0)
print(a, b)