[Python] pyquery 教學

程式語言:Python
Package:pyquery
官方文件
官方 GitHub
對比
[Python] Beautifulsoup4 教學

功能:jQuery-like 分析 html
jQuery Selectors
  1. from pyquery import PyQuery
  2.  
  3. dom = PyQuery(url="https://www.google.com.tw/")
  4. links = dom("a")
  5.  
  6. data = {}
  7. for a in links.items():
  8.     title = a.text().strip()
  9.     data[title] = a.attr['href']

讀取網頁

  1. from pyquery import PyQuery
  2.  
  3. # change parser
  4. dom = PyQuery("<html></html>", parser='xml')
  5. dom = PyQuery("<html></html>", parser='html')
  6. dom = PyQuery("<html></html>", parser='html_fragments')
  1. # by string
  2. dom = PyQuery("<html></html>")
  1. # by etree
  2. from lxml import etree
  3. dom = PyQuery(etree.fromstring("<html></html>"))
  1. # by url
  2. # 支援的 method,取決於是否有裝 requests
  3. # 沒裝的話為 urllib(Python 2 為 urllib2)
  4. dom = PyQuery(url='https://www.google.com.tw/')
  5. dom = PyQuery('https://www.google.com.tw/')
  6. # add cookies
  7. dom = PyQuery('https://www.google.com.tw/', cookies=dict(over18='1'))
  8. # add headers
  9. dom = PyQuery('https://www.google.com.tw/', headers={'User-Agent': 'I am not a robot!'})
  1. # by opener
  2. from selenium.webdriver import Firefox
  3.  
  4. def selenium_opener(url):
  5.     driver = Firefox()
  6.     driver.get(url)
  7.     html = driver.page_source
  8.     driver.quit()
  9.     return html
  10.  
  11. dom = PyQuery('https://www.google.com.tw/', opener=selenium_opener)
  1. # by file
  2. dom = PyQuery(filename=path_to_html_file)

使用範例

html 如下
from pyquery import PyQuery

html='''
<html>
<head><title>The Dormouse's story</title></head>
<body>
    <p class="title"><b>The Dormouse's story</b></p>
        <p class="story">Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>'''
dom = PyQuery(html)

訪問方法

幾乎等同 jQuery,可參考 [jQuery] 基本架構
可將 dom 視為 jQuery 的 $,由此可見 pyquery 與 jQuery 的差異極小
  1. dom('a')
  2. # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
  3.  
  4. dom('a.sister#link1')
  5. # [<a#link1.sister>]
  1. # 以 index 取出的元素並不會是 PyQuery 物件,需注意
  2. type(dom('a')[0])
  3. # <class 'lxml.html.HtmlElement'>
  4.  
  5. # 四種解法
  6. # 再用其他 attribute 選擇
  7. dom('a')('#link1')
  8. # 利用 items() 再轉為 list
  9. list(dom('a').items())[0]
  10. # 利用 PyQuery 再轉回來
  11. PyQuery(dom('a')[0])
  12. # 利用 eq
  13. dom('a').eq(0)

搜尋方法

  1. dom('p').children()
  2. # [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
  3. dom('p').children('#link1')
  4. # [<a#link1.sister>]
  1. dom('p.title').siblings()
  2. # [<p.story>, <p.story>]
  3. dom('a#link2').siblings()
  4. # [<a#link1.sister>, <a#link3.sister>]
  5. dom('a#link2').siblings('#link1')
  6. # [<a#link1.sister>]
  1. # 除 children 外,包含 text nodes
  2. # element 為 <class 'lxml.html.HtmlElement'>
  3. dom('body').contents()
  4. # ['\n', <Element p at 0x3d211d8>, '\n\n', <Element p at 0x3d21228>, '\n\n', <Element p at 0x3d1c4f8>]
  1. # 含自己往上找,找到最接近的 parents
  2. dom('a').closest()
  3. # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
  4. dom('a').closest('p')
  5. # [<p.story>, <p.story>, <p.story>]
  1. # 找到其 parent,只往上查找一層,會自動移除重複的
  2. dom('a').parent()
  3. # [<p.story>]
  4. dom('a').parent('body')
  5. # []
  1. # 找到其 parents,往上查找不停止,找出所有符合條件的,會自動移除重複的
  2. dom('a').parents()
  3. # [<html>, <body>, <p.story>]
  4. dom('a').parents('body')
  5. # [<body>]
  1. # 選擇第一個
  2. dom('a').eq(0)
  3. # [<a#link1.sister>]
  1. # 可為 function,或是平常的 string
  2. dom('p').filter(lambda i: PyQuery(this).text() == '...')
  3. # [<p.story>]
  1. dom('p').find('*')
  2. # [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
  1. list(dom('a').items())
  2. # [[<a#link1.sister>], [<a#link2.sister>], [<a#link3.sister>]]
  3. list(dom('a').items('#link1'))
  4. # [[<a#link1.sister>]]
  1. dom('a').not_('#link1')
  2. # [<a#link2.sister>, <a#link3.sister>]
  1. dom('a#link2').next()
  2. # [<a#link3.sister>]
  3. dom('a#link2').next('a')
  4. # [<a#link3.sister>]
  1. dom('a#link2').prev()
  2. # [<a#link1.sister>]
  3. dom('a#link2').prev('a')
  4. # [<a#link1.sister>]
  1. dom('a#link1').next_all()
  2. dom('a#link1').nextAll()
  3. # [<a#link2.sister>, <a#link3.sister>]
  4. dom('a#link1').next_all('#link3')
  5. dom('a#link1').nextAll('#link3')
  6. # [<a#link3.sister>]
  1. dom('a#link3').prev_all()
  2. dom('a#link3').prevAll()
  3. # [<a#link1.sister>, <a#link2.sister>]
  4. dom('a#link3').prev_all('#link1')
  5. dom('a#link3').prevAll('#link1')
  6. # [<a#link1.sister>]
  1. # 回傳上一個 traversal
  2. dom('a').parent().end()

Attributes

  1. # 會以第一個 <a> 為主
  2. dom('a').attr('id')
  3. dom('a').attr.id
  4. dom('a').attr['id']
  5. # 'link1'
  6.  
  7. dom('a').attr('class_')
  8. dom('a').attr.class_
  9. dom('a').attr['class_']
  10. # 'sister'
  1. # 只要其中之一有就是 True
  2. dom('p').has_class('story')
  3. dom('p').hasClass('story')
  4. # True
  1. # 以第一個為主,物件高度,沒有會回傳 None
  2. dom('p').height()
  3. # None
  1. # 以第一個為主,物件寬度,沒有會回傳 None
  2. dom('p').width()
  3. # None
  1. # 回傳所有元件的 text
  2. dom('title').text()
  3. # "The Dormouse's story"
  1. # 以第一個為主,回傳 value 值,沒有會回傳 None
  2. dom('p').val()
  3. # None

Properties

  1. # 兩者意義一樣
  2. dom('a').length # 3
  3. dom('a').size() # 3
  1. # 若無網址則回傳 None
  2. dom.base_url
  3. # None
  4. domT = PyQuery('https://www.google.com.tw/')
  5. domT.base_url
  6. # 'https://www.google.com.tw/'
  1. # 只適用有 root 的,不然會有錯誤,像是最上層的 dom
  2. dom.root
  3. # <lxml.etree._ElementTree object at 0x0000000003D23648>
  1. # 只適用有 root 的,不然會有錯誤,像是最上層的 dom
  2. dom.encoding
  3. # 'ISO-8859-1'

網頁內容

  1. dom('head').html()
  2. # "<title>The Dormouse's story</title>"
  1. dom('head').outer_html()
  2. dom('head').outerHtml()
  3. # "<head><title>The Dormouse's story</title></head>"

更改方法

更改會影響原始內容,且一次改所有的 element
  1. # 複製原本的內容並回傳,可用在不想被更改的元件上
  2. dom('.title').clone()
  1. # 會將原本的替換掉
  2. dom('a').attr('class_', "classA")
  3. dom('a').attr.class_ = "classA"
  4. dom('a').attr['class_'] = "classA"
  5. # [<a#link1.classA>, <a#link2.classA>, <a#link3.classA>]
  6. dom('a#link2').attr('id', "link4")
  7. dom('a#link2').attr.id = "link4"
  8. dom('a#link2').attr['id'] = "link4"
  9. # [<a#link1.sister>, <a#link4.sister>, <a#link3.sister>]
  1. # 保留原本的
  2. dom('a').add_class('classB')
  3. dom('a').addClass('classB')
  4. # [<a#link1.sister.classB>, <a#link2.sister.classB>, <a#link3.sister.classB>]
  1. dom('p').remove_class('story')
  2. dom('p').removeClass('story')
  3. # [<p.title>, <p>, <p>]
  1. dom('a').toggle_class('brother').toggle_class('sister')
  2. dom('a').toggleClass('brother').toggleClass('sister')
  3. # [<a#link1.brother>, <a#link2.brother>, <a#link3.brother>]
  1. dom('a').val('123')
  2. # <a href="http://example.com/elsie" class="sister" id="link1" value="123">Elsie</a>,
  3. # <a href="http://example.com/lacie" class="sister" id="link2" value="123">Lacie</a> and
  4. # <a href="http://example.com/tillie" class="sister" id="link3" value="123">Tillie</a>;
  1. dom('a').remove_attr('class')
  2. dom('a').removeAttr('class')
  3. # [<a#link1>, <a#link2>, <a#link3>]
  1. dom('.title').text('123')
  2. # 原來的 <b> 被移除了
  3. # <p class="title">123</p>
  1. dom('a').css("font-size", "15px")
  2. # <a href="http://example.com/elsie" class="sister" id="link1" style="font-size: 15px">Elsie</a>,
  3. # <a href="http://example.com/lacie" class="sister" id="link2" style="font-size: 15px">Lacie</a> and
  4. # <a href="http://example.com/tillie" class="sister" id="link3" style="font-size: 15px">Tillie</a>;
  5. # and they lived at the bottom of a well.
  1. doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
  2. doc
  3. # [<{http://example.com/foo}foo>]
  4. doc.remove_namespaces()
  5. # [<foo>]
  1. dom('.title').before("<div>before</div>")
  2. # <div>before</div><p class="title"><b>The Dormouse's story</b></p>
  1. dom('.title').after("<div>after</div>")
  2. # <p class="title"><b>The Dormouse's story</b></p>
  3. # <div>after</div>
  1. dom('.title').prepend("<div>prepend</div>")
  2. # <p class="title"><div>prepend</div><b>The Dormouse's story</b></p>
  1. dom('.title').append("<div>append</div>")
  2. # <p class="title"><b>The Dormouse's story</b><div>append</div></p>
  1. # 從原本的位置移到其他元件裡面前面,且放到所有的元件
  2. dom('.title').prepend_to(dom('.story'))
  3. dom('.title').prependTo(dom('.story'))
  4. # 其中一個
  5. # <p class="story"><p class="title"><b>The Dormouse's story</b></p>...</p>
  1. # 從原本的位置移到其他元件裡面後面,且放到所有的元件
  2. dom('.title').append_to(dom('.story'))
  3. dom('.title').appendTo(dom('.story'))
  4. # 其中一個
  5. # <p class="story">...<p class="title"><b>The Dormouse's story</b></p>
  1. # 從原本的位置移到其他元件前面,且放到所有元件的第一個
  2. dom('b').insert_before(dom('.story'))
  3. dom('b').insertBefore(dom('.story'))
  4. #<b>The Dormouse's story</b><p class="story">Once upon a time there w
  1. # 從原本的位置移到其他元件後面,且放到所有元件的第一個
  2. dom('.title').insert_after(dom('.story'))
  3. dom('.title').insertAfter(dom('.story'))
  4. # <p class="story">Once upon a time there were three little sisters; and their names were
  5. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  6. # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  7. # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  8. # and they lived at the bottom of a well.</p>
  9. # <p class="title"><b>The Dormouse's story</b></p>
  1. # 清空內容
  2. dom('p').empty()
  3. # <html><head><title>The Dormouse's story</title></head>
  4. # <body>
  5. # <p class="title"/>
  6. # <p class="story"/>
  7. # <p class="story"/></body></html>
  1. dom('p').remove()
  2. # <html>
  3. # <head><title>The Dormouse's story</title></head>
  4. # <body>
  5. # </body></html>
  1. dom('a').height('123')
  2. # <a href="http://example.com/elsie" class="sister" id="link1" height="123">Elsie</a>
  3. # <a href="http://example.com/lacie" class="sister" id="link2" height="123">Lacie</a>
  4. # <a href="http://example.com/tillie" class="sister" id="link3" height="123">Tillie</a>
  1. dom('a').width('123')
  2. # <a href="http://example.com/elsie" class="sister" id="link1" width="123">Elsie</a>
  3. # <a href="http://example.com/lacie" class="sister" id="link2" width="123">Lacie</a>
  4. # <a href="http://example.com/tillie" class="sister" id="link3" width="123">Tillie</a>
  1. dom('a').hide()
  2. # <a href="http://example.com/elsie" class="sister" id="link1" style="display: none">Elsie</a>
  3. # <a href="http://example.com/lacie" class="sister" id="link2" style="display: none">Lacie</a>
  4. # <a href="http://example.com/tillie" class="sister" id="link3" style="display: none">Tillie</a>
  1. dom('a').show()
  2. # <a href="http://example.com/elsie" class="sister" id="link1" style="display: block">Elsie</a>
  3. # <a href="http://example.com/lacie" class="sister" id="link2" style="display: block">Lacie</a>
  4. # <a href="http://example.com/tillie" class="sister" id="link3" style="display: block">Tillie</a>
  1. dom('p').html('<b>test</b>')
  2. # <p class="title"><b>test</b></p>
  3. # <p class="story"><b>test</b></p>
  4. # <p class="story"><b>test</b></p>
  1. dom('a').replace_all('p')
  2. dom('a').replaceAll('p')
  3. dom('p').replace_with(dom('a'))
  4. dom('p').replaceWith(dom('a'))
  5. # <html>
  6. # <head><title>The Dormouse's story</title></head>
  7. # <body>
  8. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  9. # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  10. # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  11. # and they lived at the bottom of a well.
  12. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  13. # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  14. # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  15. # and they lived at the bottom of a well.
  16. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  17. # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  18. # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  19. # and they lived at the bottom of a well.</body></html>
  1. dom('a').wrap('<div></div>')
  2. # <div><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,</div>,
  3. # <div><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and</div> and
  4. # <div><a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</div>
  1. dom('a').wrap_all('<div></div>')
  2. dom('a').wrapAll('<div></div>')
  3. # <div>
  4. # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
  5. # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
  6. # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
  7. # and they lived at the bottom of a well.
  8. # </div>

特殊方法

  1. # 將連結路徑全改為絕對路徑
  2. dom.make_links_absolute()
  3. # 若無 base_url 則需自行代入
  4. dom('a').make_links_absolute(base_url="http://abc")
  1. # e 為 <class 'lxml.html.HtmlElement'>
  2. # 故需再用 PyQuery 轉回來
  3. dom('a').each(lambda i, e: print(i, PyQuery(e).attr.href))
  4. dom('a').each(lambda i, e: print(i, PyQuery(this).attr.href))
  5. # 仍會回傳
  6. # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
  7. # 印出
  8. # 0 http://example.com/elsie
  9. # 1 http://example.com/lacie
  10. # 2 http://example.com/tillie
  1. dom('a').map(lambda i, e: PyQuery(e).attr.href)
  2. dom('a').map(lambda i, e: PyQuery(this).attr.href)
  3. # ['http://example.com/elsie', 'http://example.com/lacie', 'http://example.com/tillie']
  1. # list 加入別的 PyQuery 物件
  2. dom('.title').extend(dom('p').eq(2))
  3. # [<p.title>, <p.story>]
  1. # 以當前元件做判斷
  2. dom('p').is_('b')
  3. # False
  4. dom('a').extend(dom('p')).is_('p')
  5. # True
  1. doc = PyQuery('<html xmlns="http://www.w3.org/1999/xhtml"></html>')
  2. doc
  3. # [<{http://www.w3.org/1999/xhtml}html>]
  4. doc.xhtml_to_html()
  5. # [<html>]
  1. # 自定 function
  2. fn = lambda: this.map(lambda i, e: PyQuery(this).outerHtml())
  3. PyQuery.fn.listOuterHtml = fn
  4. dom('a').listOuterHtml()
  5. # ['<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>',
  6. # '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>',
  7. # '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>']

參考

比美麗的湯更美麗:pyquery
Python爬虫利器六之PyQuery的用法

留言