程式語言:Python
Package:pyquery
官方文件
官方 GitHub
對比
[Python] Beautifulsoup4 教學
功能:jquery-like 分析 html
jQuery Selectors
Python爬虫利器六之PyQuery的用法
Package:pyquery
官方文件
官方 GitHub
對比
[Python] Beautifulsoup4 教學
功能:jquery-like 分析 html
jQuery Selectors
- from pyquery import PyQuery
- dom = PyQuery(url="https://www.google.com.tw/")
- links = dom("a")
- data = {}
- for a in links.items():
-     title = a.text().strip()
-     data[title] = a.attr['href']
讀取網頁
- from pyquery import PyQuery
- # change parser
- dom = PyQuery("<html></html>", parser='xml')
- dom = PyQuery("<html></html>", parser='html')
- dom = PyQuery("<html></html>", parser='html_fragments')
- # by string
- dom = PyQuery("<html></html>")
- # by etree
- from lxml import etree
- dom = PyQuery(etree.fromstring("<html></html>"))
- # by url
- # 支援的 method,取決於是否有裝 requests
- # 沒裝的話為 urllib(Python 2 為 urllib2)
- dom = PyQuery(url='https://www.google.com.tw/')
- dom = PyQuery('https://www.google.com.tw/')
- # add cookies
- dom = PyQuery('https://www.google.com.tw/', cookies=dict(over18='1'))
- # add headers
- dom = PyQuery('https://www.google.com.tw/', headers={'User-Agent': 'I am not a robot!'})
- # by opener
- from selenium.webdriver import Firefox
- def selenium_opener(url):
-     driver = Firefox()
-     driver.get(url)
-     html = driver.page_source
-     driver.quit()
-     return html
- dom = PyQuery('https://www.google.com.tw/', opener=selenium_opener)
- # by file
- dom = PyQuery(filename=path_to_html_file)
使用範例
html 如下
- from pyquery import PyQuery
- html = '''
- <html>
- <head><title>The Dormouse's story</title></head>
- <body>
- <p class="title"><b>The Dormouse's story</b></p>
- <p class="story">Once upon a time there were three little sisters; and their names were
- <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
- <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- and they lived at the bottom of a well.
- </p>
- <p class="story">...</p>
- '''
- dom = PyQuery(html)
訪問方法
幾乎等同 jQuery,可參考 [jQuery] 基本架構
可將 dom 視為 $,可理解到 pyquery & jQuery 差異極小
- dom('a')
- # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
- dom('a.sister#link1')
- # [<a#link1.sister>]
- # 以索引(例如 [0])取出的元素並不會是 PyQuery 物件,需注意
- type(dom('a')[0])
- # <class 'lxml.html.HtmlElement'>
- # 四種解法
- # 再用其他 attribute 選擇
- dom('a')('#link1')
- # 利用 items() 再轉為 list
- list(dom('a').items())[0]
- # 利用 PyQuery 再轉回來
- PyQuery(dom('a')[0])
- # 利用 eq
- dom('a').eq(0)
搜尋方法
- dom('p').children()
- # [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
- dom('p').children('#link1')
- # [<a#link1.sister>]
- dom('p.title').siblings()
- # [<p.story>, <p.story>]
- dom('a#link2').siblings()
- # [<a#link1.sister>, <a#link3.sister>]
- dom('a#link2').siblings('#link1')
- # [<a#link1.sister>]
- # 除 children 外,包含 text nodes
- # element 為 <class 'lxml.html.HtmlElement'>
- dom('body').contents()
- # ['\n', <Element p at 0x3d211d8>, '\n\n', <Element p at 0x3d21228>, '\n\n', <Element p at 0x3d1c4f8>]
- # 含自己往上找,找到最接近的 parents
- dom('a').closest()
- # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
- dom('a').closest('p')
- # [<p.story>, <p.story>, <p.story>]
- # 找到其 parent,只往上查找一層,會自動移除重覆的
- dom('a').parent()
- # [<p.story>]
- dom('a').parent('body')
- # []
- # 找到其 parents,往上查找不停止,找出所有符合條件的,會自動移除重覆的
- dom('a').parents()
- # [<html>, <body>, <p.story>]
- dom('a').parents('body')
- # [<body>]
- # 選擇第一個
- dom('a').eq(0)
- # [<a#link1.sister>]
- # 可為 function,或是平常的 string
- dom('p').filter(lambda i: PyQuery(this).text() == '...')
- # [<p.story>]
- dom('p').find('*')
- # [<b>, <a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
- list(dom('a').items())
- # [[<a#link1.sister>], [<a#link2.sister>], [<a#link3.sister>]]
- list(dom('a').items('#link1'))
- # [[<a#link1.sister>]]
- dom('a').not_('#link1')
- # [<a#link2.sister>, <a#link3.sister>]
- dom('a#link2').next()
- # [<a#link3.sister>]
- dom('a#link2').next('a')
- # [<a#link3.sister>]
- dom('a#link2').prev()
- # [<a#link1.sister>]
- dom('a#link2').prev('a')
- # [<a#link1.sister>]
- dom('a#link1').next_all()
- dom('a#link1').nextAll()
- # [<a#link2.sister>, <a#link3.sister>]
- dom('a#link1').next_all('#link3')
- dom('a#link1').nextAll('#link3')
- # [<a#link3.sister>]
- dom('a#link3').prev_all()
- dom('a#link3').prevAll()
- # [<a#link1.sister>, <a#link2.sister>]
- dom('a#link3').prev_all('#link1')
- dom('a#link3').prevAll('#link1')
- # [<a#link1.sister>]
- # 回傳上一個 traversal
- dom('a').parent().end()
Attributes
- # 會以第一個 <a> 為主
- dom('a').attr('id')
- dom('a').attr.id
- dom('a').attr['id']
- # 'link1'
- dom('a').attr('class_')
- dom('a').attr.class_
- dom('a').attr['class_']
- # 'sister'
- # 只要其中之一有就是 True
- dom('p').has_class('story')
- dom('p').hasClass('story')
- # True
- # 以第一個為主,物件高度,沒有會回傳 None
- dom('p').height()
- # None
- # 以第一個為主,物件寬度,沒有會回傳 None
- dom('p').width()
- # None
- # 回傳所有元件的 text
- dom('title').text()
- # "The Dormouse's story"
- # 以第一個為主,回傳 value 值,沒有會回傳 None
- dom('p').val()
- # None
Properties
- # 兩者意義一樣
- dom('a').length # 3
- dom('a').size() # 3
- # 若無網址則回傳 None
- dom.base_url
- # None
- domT = PyQuery('https://www.google.com.tw/')
- domT.base_url
- # 'https://www.google.com.tw/'
- # 只適用有 root 的,不然會有錯誤,像是最上層的 dom
- dom.root
- # <lxml.etree._ElementTree object at 0x0000000003D23648>
- # 只適用有 root 的,不然會有錯誤,像是最上層的 dom
- dom.encoding
- # 'ISO-8859-1'
網頁內容
- dom('head').html()
- # "<title>The Dormouse's story</title>"
- dom('head').outer_html()
- dom('head').outerHtml()
- # "<head><title>The Dormouse's story</title></head>"
更改方法
更改會影響原始內容,且一次改所有的 element
- # 複製原本的內容並回傳,可用在不想被更改的元件上
- dom('.title').clone()
- # 會將原本的替換掉
- dom('a').attr('class_', "classA")
- dom('a').attr.class_ = "classA"
- dom('a').attr['class_'] = "classA"
- # [<a#link1.classA>, <a#link2.classA>, <a#link3.classA>]
- dom('a#link2').attr('id', "link4")
- dom('a#link2').attr.id = "link4"
- dom('a#link2').attr['id'] = "link4"
- # [<a#link1.sister>, <a#link4.sister>, <a#link3.sister>]
- # 保留原本的
- dom('a').add_class('classB')
- dom('a').addClass('classB')
- # [<a#link1.sister.classB>, <a#link2.sister.classB>, <a#link3.sister.classB>]
- dom('p').remove_class('story')
- dom('p').removeClass('story')
- # [<p.title>, <p>, <p>]
- dom('a').toggle_class('brother').toggle_class('sister')
- dom('a').toggleClass('brother').toggleClass('sister')
- # [<a#link1.brother>, <a#link2.brother>, <a#link3.brother>]
- dom('a').val('123')
- # <a href="http://example.com/elsie" class="sister" id="link1" value="123">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2" value="123">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3" value="123">Tillie</a>;
- dom('a').remove_attr('class')
- dom('a').removeAttr('class')
- # [<a#link1>, <a#link2>, <a#link3>]
- dom('.title').text('123')
- # 原來的 <b> 被移除了
- # <p class="title">123</p>
- dom('a').css("font-size", "15px")
- # <a href="http://example.com/elsie" class="sister" id="link1" style="font-size: 15px">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2" style="font-size: 15px">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3" style="font-size: 15px">Tillie</a>;
- # and they lived at the bottom of a well.
- doc = PyQuery('<foo xmlns="http://example.com/foo"></foo>')
- doc
- # [<{http://example.com/foo}foo>]
- doc.remove_namespaces()
- # [<foo>]
- dom('.title').before("<div>before</div>")
- # <div>before</div><p class="title"><b>The Dormouse's story</b></p>
- dom('.title').after("<div>after</div>")
- # <p class="title"><b>The Dormouse's story</b></p>
- # <div>after</div>
- dom('.title').prepend("<div>prepend</div>")
- # <p class="title"><div>prepend</div><b>The Dormouse's story</b></p>
- dom('.title').append("<div>append</div>")
- # <p class="title"><b>The Dormouse's story</b><div>append</div></p>
- # 從原本的位置移到其他元件裡面前面,且放到所有的元件
- dom('.title').prepend_to(dom('.story'))
- dom('.title').prependTo(dom('.story'))
- # 其中一個
- # <p class="story"><p class="title"><b>The Dormouse's story</b></p>...</p>
- # 從原本的位置移到其他元件裡面後面,且放到所有的元件
- dom('.title').append_to(dom('.story'))
- dom('.title').appendTo(dom('.story'))
- # 其中一個
- # <p class="story">...<p class="title"><b>The Dormouse's story</b></p>
- # 從原本的位置移到其他元件前面,且放到所有元件的第一個
- dom('b').insert_before(dom('.story'))
- dom('b').insertBefore(dom('.story'))
- #<b>The Dormouse's story</b><p class="story">Once upon a time there w
- # 從原本的位置移到其他元件後面,且放到所有元件的第一個
- dom('.title').insert_after(dom('.story'))
- dom('.title').insertAfter(dom('.story'))
- # <p class="story">Once upon a time there were three little sisters; and their names were
- # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- # and they lived at the bottom of a well.</p>
- # <p class="title"><b>The Dormouse's story</b></p>
- # 清空內容
- dom('p').empty()
- # <html><head><title>The Dormouse's story</title></head>
- # <body>
- # <p class="title"/>
- # <p class="story"/>
- # <p class="story"/></body></html>
- dom('p').remove()
- # <html>
- # <head><title>The Dormouse's story</title></head>
- # <body>
- # </body></html>
- dom('a').height('123')
- # <a href="http://example.com/elsie" class="sister" id="link1" height="123">Elsie</a>
- # <a href="http://example.com/lacie" class="sister" id="link2" height="123">Lacie</a>
- # <a href="http://example.com/tillie" class="sister" id="link3" height="123">Tillie</a>
- dom('a').width('123')
- # <a href="http://example.com/elsie" class="sister" id="link1" width="123">Elsie</a>
- # <a href="http://example.com/lacie" class="sister" id="link2" width="123">Lacie</a>
- # <a href="http://example.com/tillie" class="sister" id="link3" width="123">Tillie</a>
- dom('a').hide()
- # <a href="http://example.com/elsie" class="sister" id="link1" style="display: none">Elsie</a>
- # <a href="http://example.com/lacie" class="sister" id="link2" style="display: none">Lacie</a>
- # <a href="http://example.com/tillie" class="sister" id="link3" style="display: none">Tillie</a>
- dom('a').show()
- # <a href="http://example.com/elsie" class="sister" id="link1" style="display: block">Elsie</a>
- # <a href="http://example.com/lacie" class="sister" id="link2" style="display: block">Lacie</a>
- # <a href="http://example.com/tillie" class="sister" id="link3" style="display: block">Tillie</a>
- dom('p').html('<b>test</b>')
- # <p class="title"><b>test</b></p>
- # <p class="story"><b>test</b></p>
- # <p class="story"><b>test</b></p>
- dom('a').replace_all('p')
- dom('a').replaceAll('p')
- dom('p').replace_with(dom('a'))
- dom('p').replaceWith(dom('a'))
- # <html>
- # <head><title>The Dormouse's story</title></head>
- # <body>
- # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- # and they lived at the bottom of a well.
- # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- # and they lived at the bottom of a well.
- # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- # and they lived at the bottom of a well.</body></html>
- dom('a').wrap('<div></div>')
- # <div><a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,</div>,
- # <div><a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and</div> and
- # <div><a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; and they lived at the bottom of a well.</div>
- dom('a').wrap_all('<div></div>')
- dom('a').wrapAll('<div></div>')
- # <div>
- # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
- # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
- # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
- # and they lived at the bottom of a well.
- # </div>
特殊方法
- # 將連結路徑全改為絕對路徑
- dom.make_links_absolute()
- # 若無 base_url 則需自行代入
- dom('a').make_links_absolute(base_url="http://abc")
- # e 為 <class 'lxml.html.HtmlElement'>
- # 故需再用 PyQuery 轉回來
- dom('a').each(lambda i, e: print(i, PyQuery(e).attr.href))
- dom('a').each(lambda i, e: print(i, PyQuery(this).attr.href))
- # 仍會回傳
- # [<a#link1.sister>, <a#link2.sister>, <a#link3.sister>]
- # 印出
- # 0 http://example.com/elsie
- # 1 http://example.com/lacie
- # 2 http://example.com/tillie
- dom('a').map(lambda i, e: PyQuery(e).attr.href)
- dom('a').map(lambda i, e: PyQuery(this).attr.href)
- # ['http://example.com/elsie', 'http://example.com/lacie', 'http://example.com/tillie']
- # list 加入別的 PyQuery 物件
- dom('.title').extend(dom('p').eq(2))
- # [<p.title>, <p.story>]
- # 以當前元件做判斷
- dom('p').is_('b')
- # False
- dom('a').extend(dom('p')).is_('p')
- # True
- doc = PyQuery('<html xmlns="http://www.w3.org/1999/xhtml"></html>')
- doc
- # [<{http://www.w3.org/1999/xhtml}html>]
- doc.xhtml_to_html()
- # [<html>]
- # 自定 function
- fn = lambda: this.map(lambda i, e: PyQuery(this).outerHtml())
- PyQuery.fn.listOuterHtml = fn
- dom('a').listOuterHtml()
- # ['<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>',
- # '<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>',
- # '<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>']
參考
比美麗的湯更美麗:pyquery
Python爬虫利器六之PyQuery的用法
留言
張貼留言