[PySide] using QtWebKit to get all info of web pages including that generated by AJAX

Srini Kommoori vasure at gmail.com
Mon Oct 22 17:52:08 CEST 2012


Initially I thought you were missing scrolling, but it looks like you are
only interested in getting the content, either as text or as HTML. I am
not certain, but some websites respond to particular clients really
well. You can play with the client string and see how it behaves.


On Mon, Oct 22, 2012 at 1:11 AM, flyer <flyer103 at gmail.com> wrote:
> I wrote a Python script using QtWebKit to get all page info, including info
> generated by AJAX requests. I ran the following code on a CentOS server with
> the following settings:
>
>>
>> $ Xvfb :100 -screen 0 9000x15000x24 &
>>
>> $ export DISPLAY=:100
>
>
> The following code worked; however, it could only get one screen's worth of
> the web page's info, i.e. the amount of info retrieved varied with the screen
> resolution. I could only get part of the info of the webpage.
>
> I have tried using Selenium, and I can get all the web page info if I set a
> large screen resolution using Xvfb.
>
> Please give me some tips on how to solve the problem. Any manual for
> QtWebKit would also be appreciated, because I can't find much material about it.
>
> The following is my code:
>
>>> #!/usr/bin/env python
>>>
>>> #coding: utf-8
>>>
>>>
>>>
>>> import sys
>>>
>>> import time
>>>
>>>
>>> from PySide.QtCore import QUrl, SIGNAL
>>>
>>> from PySide.QtGui import QApplication
>>>
>>> from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
>>>
>>> from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest
>>>
>>>
>>> reload(sys)
>>>
>>> sys.setdefaultencoding('utf-8')
>>>
>>>
>>> fn_log = 'url_dd.txt'
>>>
>>> fp_log = open(fn_log, 'ab+')
>>>
>>>
>>> class WebPage(QWebPage):
>>>
>>>
>>>     def __init__(self, logger=None, parent=None):
>>>
>>>         super(WebPage, self).__init__(parent)
>>>
>>>
>>>
>>>     def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
>>>
>>>         sys.stderr.write('Javascritp error at line number %d\n' %
>>> (lineNumber))
>>>
>>>         sys.stderr.write('%s\n' % (message, ))
>>>
>>>         sys.stderr.write('Source ID: %s\n' % (sourceID, ))
>>>
>>>
>>>
>>> class Crawler(QApplication):
>>>
>>>
>>>
>>>     def __init__(self, url):
>>>
>>>         super(Crawler, self).__init__(sys.argv)
>>>
>>>
>>>
>>>         self.url = url
>>>
>>>         self.web_view = QWebView()
>>>
>>>         self.web_page = WebPage()
>>>
>>>         self.web_view.setPage(self.web_page)
>>>
>>>         self.web_frame = self.web_page.mainFrame()
>>>
>>>
>>>         self.network = NetworkAccessManager()
>>>
>>>         self.web_page.setNetworkAccessManager(self.network)
>>>
>>>
>>>
>>>         self.settings = self.web_page.settings().globalSettings()
>>>
>>>         self.settings.setAttribute(QWebSettings.AutoLoadImages, False)
>>>
>>>         self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
>>>
>>>         QWebSettings.clearMemoryCaches()
>>>
>>>
>>>         self.web_view.resize(1024, 9000)
>>>
>>>
>>>         self.connect(self.web_page, SIGNAL('loadFinished(bool)'),
>>> self.loadFinished)
>>>
>>>
>>>         print 'Before loading'
>>>
>>>         self.web_view.load(QUrl(self.url))
>>>
>>>         print 'After loading'
>>>
>>>
>>>     def loadFinished(self, ok):
>>>
>>>         print 'Start loadFinished()'
>>>
>>>
>>>         print 'Start writing'
>>>
>>>         with open('content_dd.txt', 'ab+') as fp:
>>>
>>>             fp.write(self.web_frame.toHtml().toUtf8())
>>>
>>>         print 'End writing'
>>>
>>>
>>>
>>>         print 'End loadFinished()'
>>>
>>>
>>>         try:
>>>
>>>             self.quit()
>>>
>>>         except Exception, e:
>>>
>>>             print 'FATAL ERROR: %s' % (str(e), )
>>>
>>>
>>>
>>> class NetworkAccessManager(QNetworkAccessManager):
>>>
>>>
>>>
>>>     def __init__(self):
>>>
>>>         super(NetworkAccessManager, self).__init__()
>>>
>>>         # QNetworkAccessManager.__init__(self)
>>>
>>>         self.connect(self, SIGNAL('finished (QNetworkReply *)'),
>>> self.finishd)
>>>
>>>
>>>
>>>     def createRequest(self, operation, request, data):
>>>
>>>         # url = request.url().toString()
>>>
>>>         self.setNetworkAccessible(self.Accessible)
>>>
>>>
>>>
>>>         return QNetworkAccessManager.createRequest(self, operation,
>>> request, data)
>>>
>>>
>>>     def finishd(self, reply):
>>>
>>>         print 'In NetworkAccessManager finishd'
>>>
>>>         url = str(reply.url().toString())
>>>
>>>
>>>
>>>         log = '%s: %s\n' % (time.ctime(), url)
>>>
>>>         fp_log.write(log)
>>>
>>>
>>>         print url
>>>
>>>
>>>
>>> if __name__ == '__main__':
>>>
>>>     # url =
>>> 'http://product.dangdang.com/product.aspx?product_id=22822333'
>>>
>>>     url = 'http://product.dangdang.com/product.aspx?product_id=22848707'
>>>
>>>     crawler = Crawler(url)
>>>
>>>     sys.exit(crawler.exec_())
>>
>>
>
> --
> 宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。
>
>
>
> _______________________________________________
> PySide mailing list
> PySide at qt-project.org
> http://lists.qt-project.org/mailman/listinfo/pyside
>



More information about the PySide mailing list