# Download the page and query it with lxml.
res = requests.get(url)
# NOTE: res.text is an already-decoded str. etree.HTML raises ValueError when
# such a str carries an encoding declaration; it asks for bytes input instead.
html = etree.HTML(res.text)
contents = html.xpath('//div/xxxx')
然后遇到了如下的错误信息:
1 2 3 4 5 6 7 8
Traceback (most recent call last):
  File "xxxxxxxx.py", line 157, in <module>
    get_website_title_content(url)
  File "xxxxxxxx.py", line 141, in get_website_title_content
    html = etree.HTML(html_text)
  File "src\lxml\etree.pyx", line 3170, in lxml.etree.HTML
  File "src\lxml\parser.pxi", line 1872, in lxml.etree._parseMemoryDocument
ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
关键错误就是 ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
self._content_consumed = True # don't need to release the connection; that's been handled by urllib3 # since we exhausted the data. return self._content
@property def text(self): """Content of the response, in unicode. If Response.encoding is None, encoding will be guessed using ``chardet``. The encoding of the response content is determined based solely on HTTP headers, following RFC 2616 to the letter. If you can take advantage of non-HTTP knowledge to make a better guess at the encoding, you should set ``r.encoding`` appropriately before accessing this property. """
# Fallback to auto-detected encoding. if self.encoding is None: encoding = self.apparent_encoding
# Decode unicode from given encoding. try: content = str(self.content, encoding, errors='replace') except (LookupError, TypeError): # A LookupError is raised if the encoding was not found which could # indicate a misspelling or similar mistake. # # A TypeError can be raised if encoding is None # # So we try blindly encoding. content = str(self.content, errors='replace')