[Python] pyquery 教學

程式語言：Python
Package：pyquery
官方文件
官方 GitHub
對比
[Python] Beautifulsoup4 教學

功能：jquery-like 分析 html
jQuery Selectors

from pyquery import PyQuery

# Load the page from its URL and select every anchor element.
dom = PyQuery(url="https://www.google.com.tw/")
links = dom("a")

# Map each link's whitespace-trimmed text to its href attribute;
# items() yields every match as a PyQuery object so text()/attr are usable.
data = {a.text().strip(): a.attr['href'] for a in links.items()}

讀取網頁

from pyquery import PyQuery

# change parser
dom = PyQuery("<html></html>", parser='xml')
dom = PyQuery("<html></html>", parser='html')
dom = PyQuery("<html></html>", parser='html_fragments')

# by string
dom = PyQuery("<html></html>")

# by etree
from lxml import etree
dom = PyQuery(etree.fromstring("<html></html>"))

# by url
# 支援的 method，取決於是否有裝 requests
# 沒裝的話為 urllib（Python 2 為 urllib2）
dom = PyQuery(url='https://www.google.com.tw/')
dom = PyQuery('https://www.google.com.tw/')
# add cookies
dom = PyQuery('https://www.google.com.tw/', cookies=dict(over18='1'))
# add headers
dom = PyQuery('https://www.google.com.tw/', headers={'User-Agent': 'I am not a robot!'})

# by opener
from selenium.webdriver import Firefox

def selenium_opener(url):
    """Return the page source of ``url`` as loaded by a Firefox WebDriver.

    Meant to be passed as the ``opener`` argument of ``PyQuery`` so the
    document is built from the HTML the browser reports for the page.

    Args:
        url: Address of the page to load.

    Returns:
        The value of the driver's ``page_source`` after loading ``url``.
    """
    driver = Firefox()
    try:
        driver.get(url)
        # The return value is evaluated before the ``finally`` clause runs,
        # so the source is captured while the browser is still open.
        return driver.page_source
    finally:
        # Always shut the browser down, even when loading the page fails;
        # otherwise the Firefox process would be left running.
        driver.quit()

dom = PyQuery('https://www.google.com.tw/', opener=selenium_opener)

# by file
dom = PyQuery(filename=path_to_html_file)

使用範例

html 如下

from pyquery import PyQuery

html='''
<html>
<head><title>The Dormouse's story</title></head>
<body>
    <p class="title"><b>The Dormouse's story</b></p>
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>'''
dom = PyQuery(html)

訪問方法

幾乎等同 jQuery，可參考 [jQuery] 基本架構

可將 dom 視為 $，可理解到 pyquery & jQuery 差異極小

dom('a')
# [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]

dom('a.sister#link1')
# [<a#link1.sister>]

# 並不會是 PyQuery 物件，需注意
type(dom('a')[0])
# <class 'lxml.html.HtmlElement'>

# 四種解法
# 再用其他 attribute 選擇
dom('a')('#link1')
# 利用 items() 再轉為 list
list(dom('a').items())[0]
# 利用 PyQuery 再轉回來
PyQuery(dom('a')[0])
# 利用 eq
dom('a').eq(0)

搜尋方法

dom('p').children()
# [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
dom('p').children('#link1')
# [<a#link1.sister>]

dom('p.title').siblings()
# [<p.story>, <p.story>]
dom('a#link2').siblings()
# [<a#link1.sister>, <a#link3.sister>]
dom('a#link2').siblings('#link1')
# [<a#link1.sister>]

# 除 children 外，包含 text nodes
# element 為 <class 'lxml.html.HtmlElement'>
dom('body').contents()
# ['\n', <Element p at 0x3d211d8>, '\n\n', <Element p at 0x3d21228>, '\n\n', <Element p at 0x3d1c4f8>]

# 含自己往上找，找到最接近的 parents
dom('a').closest()
# [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
dom('a').closest('p')
# [<p.story>, <p.story>, <p.story>]

# 找到其 parent，只往上查找一層，會自動移除重覆的
dom('a').parent()
# [<p.story>]
dom('a').parent('body')
# []

# 找到其 parents，往上查找不停止，找出所有符合條件的，會自動移除重覆的
dom('a').parents()
# [<html>, <body>, <p.story>]
dom('a').parents('body')
# [<body>]

# 選擇第一個
dom('a').eq(0)
# [<a#link1.sister>]

# 可為 function，或是平常的 string
dom('p').filter(lambda i: PyQuery(this).text() == '...')
# [<p.story>]

dom('p').find('*')
# [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]

list(dom('a').items())
# [[<a#link1.sister>], [<a#link2.sister>], [<a#link3.sister>]]
list(dom('a').items('#link1'))
# [[<a#link1.sister>]]

dom('a').not_('#link1')
# [<a#link2.sister>, <a#link3.sister>]

dom('a#link2').next()
# [<a#link3.sister>]
dom('a#link2').next('a')
# [<a#link3.sister>]

dom('a#link2').prev()
# [<a#link1.sister>]
dom('a#link2').prev('a')
# [<a#link1.sister>]

dom('a#link1').next_all()
dom('a#link1').nextAll()
# [<a#link2.sister>, <a#link3.sister>]
dom('a#link1').next_all('#link3')
dom('a#link1').nextAll('#link3')
# [<a#link3.sister>]

dom('a#link3').prev_all()
dom('a#link3').prevAll()
# [<a#link1.sister>, <a#link2.sister>] 
dom('a#link3').prev_all('#link1')
dom('a#link3').prevAll('#link1')
# [<a#link1.sister>]

# 回傳上一個 traversal 的結果，此例即為 dom('a')
dom('a').parent().end()

Attributes

# 會以第一個 <a> 為主
dom('a').attr('id')
dom('a').attr.id
dom('a').attr['id']
# 'link1'

dom('a').attr('class_')
dom('a').attr.class_
dom('a').attr['class_']
# 'sister'

# 只要其中之一有就是 True
dom('p').has_class('story')
dom('p').hasClass('story')
# True

# 以第一個為主，物件高度，沒有會回傳 None
dom('p').height()
# None

# 以第一個為主，物件寬度，沒有會回傳 None
dom('p').width()
# None

# 回傳所有元件的 text
dom('title').text()
# "The Dormouse's story"

# 以第一個為主，回傳 value 值，沒有會回傳 None
dom('p').val()
# None

Properties

# 兩者意義一樣
dom('a').length # 3
dom('a').size() # 3

# 若無網址則回傳 None
dom.base_url
# None
domT = PyQuery('https://www.google.com.tw/')
domT.base_url
# 'https://www.google.com.tw/'

# 只適用有 root 的，不然會有錯誤，像是最上層的 dom
dom.root
# <lxml.etree._ElementTree object at 0x0000000003D23648>

# 只適用有 root 的，不然會有錯誤，像是最上層的 dom
dom.encoding
# 'ISO-8859-1'

網頁內容

dom('head').html()
# "<title>The Dormouse's story</title>"

dom('head').outer_html()
dom('head').outerHtml()
# "<head><title>The Dormouse's story</title></head>"

更改方法

更改會影響原始內容，且一次改所有的 element

# 複製原本的內容並回傳，可用在不想被更改的元件上
dom('.title').clone()

# Setting an attribute overwrites its previous value.
# attr() is a method, so the call form takes the new value as a second
# argument (a call cannot be the target of "="); plain assignment only
# works with the attribute-style and item-style forms below.
dom('a').attr('class_', "classA")
dom('a').attr.class_ = "classA"
dom('a').attr['class_'] = "classA"
# [<a#link1.classA>, <a#link2.classA>, <a#link3.classA>]
dom('a#link2').attr('id', "link4")
dom('a#link2').attr.id = "link4"
dom('a#link2').attr['id'] = "link4"
# [<a#link1.sister>, <a#link4.sister>, <a#link3.sister>]

# 保留原本的
dom('a').add_class('classB')
dom('a').addClass('classB')
# [<a#link1.sister.classB>, <a#link2.sister.classB>, <a#link3.sister.classB>]

dom('p').remove_class('story')
dom('p').removeClass('story')
# [<p.title>, <p>, <p>]

dom('a').toggle_class('brother').toggle_class('sister')
dom('a').toggleClass('brother').toggleClass('sister')
# [<a#link1.brother>, <a#link2.brother>, <a#link3.brother>]

dom('a').val('123')
# <a href="http://example.com/elsie" class="sister" id="link1" value="123">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2" value="123">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3" value="123">Tillie</a>;

dom('a').remove_attr('class')
dom('a').removeAttr('class')
# [<a#link1>, <a#link2>, <a#link3>]

dom('.title').text('123')
# 原來的 <b> 被移除了
# <p class="title">123</p>

dom('a').css("font-size", "15px")
# <a href="http://example.com/elsie" class="sister" id="link1" style="font-size: 15px">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2" style="font-size: 15px">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3" style="font-size: 15px">Tillie</a>;
# and they lived at the bottom of a well.

doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
doc
# [<{http://example.com/foo}foo>]
doc.remove_namespaces()
# [<foo>]

dom('.title').before("<div>before</div>")
# <div>before</div><p class="title"><b>The Dormouse's story</b></p>

dom('.title').after("<div>after</div>")
# <p class="title"><b>The Dormouse's story</b></p>
# <div>after</div>

dom('.title').prepend("<div>prepend</div>")
# <p class="title"><div>prepend</div><b>The Dormouse's story</b></p>

dom('.title').append("<div>append</div>")
# <p class="title"><b>The Dormouse's story</b><div>append</div></p>

# 從原本的位置移到其他元件裡面前面，且放到所有的元件
dom('.title').prepend_to(dom('.story'))
dom('.title').prependTo(dom('.story'))
# 其中一個
# <p class="story"><p class="title"><b>The Dormouse's story</b></p>...</p>

# 從原本的位置移到其他元件裡面後面，且放到所有的元件
dom('.title').append_to(dom('.story'))
dom('.title').appendTo(dom('.story'))
# 其中一個
# <p class="story">...<p class="title"><b>The Dormouse's story</b></p>

# 從原本的位置移到其他元件前面，且放到所有元件的第一個
dom('b').insert_before(dom('.story'))
dom('b').insertBefore(dom('.story'))
#<b>The Dormouse's story</b><p class="story">Once upon a time there w

# 從原本的位置移到其他元件後面，且放到所有元件的第一個
dom('.title').insert_after(dom('.story'))
dom('.title').insertAfter(dom('.story'))
# <p class="story">Once upon a time there were three little sisters; and their names were
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</p>
# <p class="title"><b>The Dormouse's story</b></p>

# 清空內容
dom('p').empty()
# <html><head><title>The Dormouse's story</title></head>
# <body>
# <p class="title"/>
# <p class="story"/>
# <p class="story"/></body></html>

dom('p').remove()
# <html>
# <head><title>The Dormouse's story</title></head>
# <body>
# </body></html>

dom('a').height('123')
# <a href="http://example.com/elsie" class="sister" id="link1" height="123">Elsie</a>
# <a href="http://example.com/lacie" class="sister" id="link2" height="123">Lacie</a>
# <a href="http://example.com/tillie" class="sister" id="link3" height="123">Tillie</a>

dom('a').width('123')
# <a href="http://example.com/elsie" class="sister" id="link1" width="123">Elsie</a>
# <a href="http://example.com/lacie" class="sister" id="link2" width="123">Lacie</a>
# <a href="http://example.com/tillie" class="sister" id="link3" width="123">Tillie</a>

dom('a').hide()
# <a href="http://example.com/elsie" class="sister" id="link1" style="display: none">Elsie</a>
# <a href="http://example.com/lacie" class="sister" id="link2" style="display: none">Lacie</a>
# <a href="http://example.com/tillie" class="sister" id="link3" style="display: none">Tillie</a>

dom('a').show()
# <a href="http://example.com/elsie" class="sister" id="link1" style="display: block">Elsie</a>
# <a href="http://example.com/lacie" class="sister" id="link2" style="display: block">Lacie</a>
# <a href="http://example.com/tillie" class="sister" id="link3" style="display: block">Tillie</a>

dom('p').html('<b>test</b>')
# <p class="title"><b>test</b></p>
# <p class="story"><b>test</b></p>
# <p class="story"><b>test</b></p>

dom('a').replace_all('p')
dom('a').replaceAll('p')
dom('p').replace_with(dom('a'))
dom('p').replaceWith(dom('a'))
# <html>
# <head><title>The Dormouse's story</title></head>
# <body>
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.</body></html>

dom('a').wrap('<div></div>')
# <div><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,</div>,
# <div><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and</div> and
# <div><a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</div>

dom('a').wrap_all('<div></div>')
dom('a').wrapAll('<div></div>')
# <div>
# <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
# <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
# <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
# and they lived at the bottom of a well.
# </div>

特殊方法

# 將連結路徑全改為絕對路徑
dom.make_links_absolute()
# 若無 base_url 則需自行代入
dom('a').make_links_absolute(base_url="http://abc")

# e 為 <class 'lxml.html.HtmlElement'>
# 故需再用 PyQuery 轉回來
dom('a').each(lambda i, e: print(i, PyQuery(e).attr.href))
dom('a').each(lambda i, e: print(i, PyQuery(this).attr.href))
# 仍會回傳
# [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
# 印出
# 0 http://example.com/elsie
# 1 http://example.com/lacie
# 2 http://example.com/tillie

dom('a').map(lambda i, e: PyQuery(e).attr.href)
dom('a').map(lambda i, e: PyQuery(this).attr.href)
# ['http://example.com/elsie', 'http://example.com/lacie', 'http://example.com/tillie']

# list 加入別的 PyQuery 物件
dom('.title').extend(dom('p').eq(2))
# [<p.title>, <p.story>]

# 以當前元件做判斷
dom('p').is_('b')
# False
dom('a').extend(dom('p')).is_('p')
# True

doc = PyQuery('<html xmlns="http://www.w3.org/1999/xhtml"></html>')
doc
# [<{http://www.w3.org/1999/xhtml}html>]
doc.xhtml_to_html()
# [<html>]

# 自定 function
fn = lambda: this.map(lambda i, e: PyQuery(this).outerHtml())
PyQuery.fn.listOuterHtml = fn
dom('a').listOuterHtml()
# ['<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>', 
 # '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>', 
 # '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>']

參考

比美麗的湯更美麗：pyquery
Python爬虫利器六之PyQuery的用法

子風的知識庫

搜尋此網誌