- 取得連結
- X
- 以電子郵件傳送
- 其他應用程式
程式語言:Python
Package:pyquery
官方文件
官方 GitHub
對比
[Python] Beautifulsoup4 教學
功能:jquery-like 分析 html
jQuery Selectors
Python爬虫利器六之PyQuery的用法
Package:pyquery
官方文件
官方 GitHub
對比
[Python] Beautifulsoup4 教學
功能:jquery-like 分析 html
jQuery Selectors
from pyquery import PyQuery dom = PyQuery(url="https://www.google.com.tw/") links = dom("a") data = {} for a in links.items(): title = a.text().strip() data[title] = a.attr['href']
讀取網頁
from pyquery import PyQuery # change parser dom = PyQuery("<html></html>", parser='xml') dom = PyQuery("<html></html>", parser='html') dom = PyQuery("<html></html>", parser='html_fragments')
# by string dom = PyQuery("<html></html>")
# by etree from lxml import etree dom = PyQuery(etree.fromstring("<html></html>"))
# by url # 支援的 method,取決於是否有裝 requests # 沒裝的話為 urllib2 dom = PyQuery(url='https://www.google.com.tw/') dom = PyQuery('https://www.google.com.tw/') # add cookies dom = PyQuery('https://www.google.com.tw/', cookies=dict(over18='1')) # add headers dom = PyQuery('https://www.google.com.tw/', headers={'User-Agent': 'I am not a robot!'})
# by opener from selenium.webdriver import Firefox def selenium_opener(url): driver = Firefox() driver.get(url) html = driver.page_source driver.quit() return html dom = PyQuery('https://www.google.com.tw/', opener=selenium_opener)
# by file dom = PyQuery(filename=path_to_html_file)
使用範例
html 如下:
from pyquery import PyQuery html=''' <html> <head><title>The Dormouse's story</title></head> <body> <p class="title"><b>The Dormouse's story</b></p> <p class="story">Once upon a time there were three little sisters; and their names were <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well. </p> <p class="story">...</p>''' dom = PyQuery(html)
訪問方法
幾乎等同 jQuery,可參考 [jQuery] 基本架構
可將 dom 視為 $,可理解到 pyquery & jQuery 差異極小
dom('a') # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>] dom('a.sister#link1') # [<a#link1.sister>]
# 並不會是 PyQuery 物件,需注意 type(dom('a')[0]) # <class 'lxml.html.HtmlElement'> # 四種解法 # 再用其他 attribute 選擇 dom('a')('#link1') # 利用 items() 再轉為 list list(dom('a').items())[0] # 利用 PyQuery 再轉回來 PyQuery(dom('a')[0]) # 利用 eq dom('a').eq(0)
搜尋方法
dom('p').children() # [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>] dom('p').children('#link1') # [<a#link1.sister>]
dom('p.title').siblings() # [<p.story>, <p.story>] dom('a#link2').siblings() # [<a#link1.sister>, <a#link3.sister>] dom('a#link2').siblings('#link1') # [<a#link1.sister>]
# 除 children 外,包含 text nodes # element 為 <class 'lxml.html.HtmlElement'> dom('body').contents() # ['\n', <Element p at 0x3d211d8>, '\n\n', <Element p at 0x3d21228>, '\n\n', <Element p at 0x3d1c4f8>]
# 含自己往上找,找到最接近的 parents dom('a').closest() # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>] dom('a').closest('p') # [<p.story>, <p.story>, <p.story>]
# 找到其 parent,只往上查找一層,會自動移除重覆的 dom('a').parent() # [<p.story>] dom('a').parent('body') # []
# 找到其 parents,往上查找不停止,找出所有符合條件的,會自動移除重覆的 dom('a').parents() # [<html>, <body>, <p.story>] dom('a').parents('body') # [<body>]
# 選擇第一個 dom('a').eq(0) # [<a#link1.sister>]
# 可為 function,或是平常的 string dom('p').filter(lambda i: PyQuery(this).text() == '...') # [<p.story>]
dom('p').find('*') # [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
list(dom('a').items()) # [[<a#link1.sister>], [<a#link2.sister>], [<a#link3.sister>]] list(dom('a').items('#link1')) # [[<a#link1.sister>]]
dom('a').not_('#link1') # [<a#link2.sister>, <a#link3.sister>]
dom('a#link2').next() # [<a#link3.sister>] dom('a#link2').next('a') # [<a#link3.sister>]
dom('a#link2').prev() # [<a#link1.sister>] dom('a#link2').prev('a') # [<a#link1.sister>]
dom('a#link1').next_all() dom('a#link1').nextAll() # [<a#link2.sister>, <a#link3.sister>] dom('a#link1').next_all('#link3') dom('a#link1').nextAll('#link3') # [<a#link3.sister>]
dom('a#link3').prev_all() dom('a#link3').prevAll() # [<a#link1.sister>, <a#link2.sister>] dom('a#link3').prev_all('#link1') dom('a#link3').prevAll('#link1') # [<a#link1.sister>]
# 回傳上一個 traversal dom('a').parent().end()
Attributes
# 會以第一個 <a> 為主 dom('a').attr('id') dom('a').attr.id dom('a').attr['id'] # 'link1' dom('a').attr('class_') dom('a').attr.class_ dom('a').attr['class_'] # 'sister'
# 只要其中之一有就是 True dom('p').has_class('story') dom('p').hasClass('story') # True
# 以第一個為主,物件高度,沒有會回傳 None dom('p').height() # None
# 以第一個為主,物件寬度,沒有會回傳 None dom('p').width() # None
# 回傳所有元件的 text dom('title').text() # "The Dormouse's story"
# 以第一個為主,回傳 value 值,沒有會回傳 None dom('p').val() # None
Properties
# 兩者意義一樣 dom('a').length # 3 dom('a').size() # 3
# 若無網址則回傳 None dom.base_url # None domT = PyQuery('https://www.google.com.tw/') domT.base_url # 'https://www.google.com.tw/'
# 只適用有 root 的,不然會有錯誤,像是最上層的 dom dom.root # <lxml.etree._ElementTree object at 0x0000000003D23648>
# 只適用有 root 的,不然會有錯誤,像是最上層的 dom dom.encoding # 'ISO-8859-1'
網頁內容
dom('head').html() # "<title>The Dormouse's story</title>"
dom('head').outer_html() dom('head').outerHtml() # "<head><title>The Dormouse's story</title></head>"
更改方法
更改會影響原始內容,且一次改所有的 element
# 複製原本的內容並回傳,可用在不想被更改的元件上 dom('.title').clone()
# 會將原本的替換掉 dom('a').attr('class_', "classA") dom('a').attr.class_ = "classA" dom('a').attr['class_'] = "classA" # [<a#link1.classA>, <a#link2.classA>, <a#link3.classA>] dom('a#link2').attr('id', "link4") dom('a#link2').attr.id = "link4" dom('a#link2').attr['id'] = "link4" # [<a#link1.sister>, <a#link4.sister>, <a#link3.sister>]
# 保留原本的 dom('a').add_class('classB') dom('a').addClass('classB') # [<a#link1.sister.classB>, <a#link2.sister.classB>, <a#link3.sister.classB>]
dom('p').remove_class('story') dom('p').removeClass('story') # [<p.title>, <p>, <p>]
dom('a').toggle_class('brother').toggle_class('sister') dom('a').toggleClass('brother').toggleClass('sister') # [<a#link1.brother>, <a#link2.brother>, <a#link3.brother>]
dom('a').val('123') # <a href="http://example.com/elsie" class="sister" id="link1" value="123">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2" value="123">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3" value="123">Tillie</a>;
dom('a').remove_attr('class') dom('a').removeAttr('class') # [<a#link1>, <a#link2>, <a#link3>]
dom('.title').text('123') # 原來的 <b> 被移除了 # <p class="title">123</p>
dom('a').css("font-size", "15px") # <a href="http://example.com/elsie" class="sister" id="link1" style="font-size: 15px">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2" style="font-size: 15px">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3" style="font-size: 15px">Tillie</a>; # and they lived at the bottom of a well.
doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>') doc # [<{http://example.com/foo}foo>] doc.remove_namespaces() # [<foo>]
dom('.title').before("<div>before</div>") # <div>before</div><p class="title"><b>The Dormouse's story</b></p>
dom('.title').after("<div>after</div>") # <p class="title"><b>The Dormouse's story</b></p> # <div>after</div>
dom('.title').prepend("<div>prepend</div>") # <p class="title"><div>prepend</div><b>The Dormouse's story</b></p>
dom('.title').append("<div>append</div>") # <p class="title"><b>The Dormouse's story</b><div>append</div></p>
# 從原本的位置移到其他元件裡面前面,且放到所有的元件 dom('.title').prepend_to(dom('.story')) dom('.title').prependTo(dom('.story')) # 其中一個 # <p class="story"><p class="title"><b>The Dormouse's story</b></p>...</p>
# 從原本的位置移到其他元件裡面後面,且放到所有的元件 dom('.title').append_to(dom('.story')) dom('.title').appendTo(dom('.story')) # 其中一個 # <p class="story">...<p class="title"><b>The Dormouse's story</b></p>
# 從原本的位置移到其他元件前面,且放到所有元件的第一個 dom('b').insert_before(dom('.story')) dom('b').insertBefore(dom('.story')) #<b>The Dormouse's story</b><p class="story">Once upon a time there w
# 從原本的位置移到其他元件後面,且放到所有元件的第一個 dom('.title').insert_after(dom('.story')) dom('.title').insertAfter(dom('.story')) # <p class="story">Once upon a time there were three little sisters; and their names were # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well.</p> # <p class="title"><b>The Dormouse's story</b></p>
# 清空內容 dom('p').empty() # <html><head><title>The Dormouse's story</title></head> # <body> # <p class="title"/> # <p class="story"/> # <p class="story"/></body></html>
dom('p').remove() # <html> # <head><title>The Dormouse's story</title></head> # <body> # </body></html>
dom('a').height('123') # <a href="http://example.com/elsie" class="sister" id="link1" height="123">Elsie</a> # <a href="http://example.com/lacie" class="sister" id="link2" height="123">Lacie</a> # <a href="http://example.com/tillie" class="sister" id="link3" height="123">Tillie</a>
dom('a').width('123') # <a href="http://example.com/elsie" class="sister" id="link1" width="123">Elsie</a> # <a href="http://example.com/lacie" class="sister" id="link2" width="123">Lacie</a> # <a href="http://example.com/tillie" class="sister" id="link3" width="123">Tillie</a>
dom('a').hide() # <a href="http://example.com/elsie" class="sister" id="link1" style="display: none">Elsie</a> # <a href="http://example.com/lacie" class="sister" id="link2" style="display: none">Lacie</a> # <a href="http://example.com/tillie" class="sister" id="link3" style="display: none">Tillie</a>
dom('a').show() # <a href="http://example.com/elsie" class="sister" id="link1" style="display: block">Elsie</a> # <a href="http://example.com/lacie" class="sister" id="link2" style="display: block">Lacie</a> # <a href="http://example.com/tillie" class="sister" id="link3" style="display: block">Tillie</a>
dom('p').html('<b>test</b>') # <p class="title"><b>test</b></p> # <p class="story"><b>test</b></p> # <p class="story"><b>test</b></p>
dom('a').replace_all('p') dom('a').replaceAll('p') dom('p').replace_with(dom('a')) dom('p').replaceWith(dom('a')) # <html> # <head><title>The Dormouse's story</title></head> # <body> # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well.</body></html>
dom('a').wrap('<div></div>') # <div><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,</div>, # <div><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and</div> and # <div><a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</div>
dom('a').wrap_all('<div></div>') dom('a').wrapAll('<div></div>') # <div> # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; # and they lived at the bottom of a well. # </div>
特殊方法
# 將連結路徑全改為絕對路徑 dom.make_links_absolute() # 若無 base_url 則需自行代入 dom('a').make_links_absolute(base_url="http://abc")
# e 為 <class 'lxml.html.HtmlElement'> # 故需再用 PyQuery 轉回來 dom('a').each(lambda i, e: print(i, PyQuery(e).attr.href)) dom('a').each(lambda i, e: print(i, PyQuery(this).attr.href)) # 仍會回傳 # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>] # 印出 # 0 http://example.com/elsie # 1 http://example.com/lacie # 2 http://example.com/tillie
dom('a').map(lambda i, e: PyQuery(e).attr.href) dom('a').map(lambda i, e: PyQuery(this).attr.href) # ['http://example.com/elsie', 'http://example.com/lacie', 'http://example.com/tillie']
# list 加入別的 PyQuery 物件 dom('.title').extend(dom('p').eq(2)) # [<p.title>, <p.story>]
# 以當前元件做判斷 dom('p').is_('b') # False dom('a').extend(dom('p')).is_('p') # True
doc = PyQuery('<html xmlns="http://www.w3.org/1999/xhtml"></html>') doc # [<{http://www.w3.org/1999/xhtml}html>] doc.xhtml_to_html() # [<html>]
# 自定 function fn = lambda: this.map(lambda i, e: PyQuery(this).outerHtml()) PyQuery.fn.listOuterHtml = fn dom('a').listOuterHtml() # ['<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>', # '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>', # '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>']
參考
比美麗的湯更美麗:pyquery
Python爬虫利器六之PyQuery的用法
留言
張貼留言