[PySide] using QtWebKit to get all info of web pages including that generated by AJAX

flyer flyer103 at gmail.com
Mon Oct 22 10:11:32 CEST 2012


I wrote a Python script using *QtWebKit* to get all page info, including
info generated by AJAX requests. I run the following code on a CentOS server
with the following settings:


> $ Xvfb :100 -screen 0 9000x15000x24 &

$ export DISPLAY=:100


The following code worked; however, it could only get *one screen* of info
from the web page — that is, it retrieved a different amount of info depending
on the screen resolution, so I could only get part of the webpage's content.

I have tried using *selenium* and I can get all the web info if I set a large
screen resolution using *Xvfb*.

Please give me some tips on how to solve the problem; any manual for
*QtWebKit* would also be appreciated, because I can't find more material about it.

The following is my code:

#!/usr/bin/env python
# coding: utf-8
"""Crawl a page with QtWebKit and dump the fully rendered HTML (incl. AJAX)."""

import sys
import time

from PySide.QtCore import QUrl, SIGNAL
from PySide.QtGui import QApplication
from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest

# HACK: re-expose setdefaultencoding (hidden by site.py at startup) so that
# implicit str/unicode conversions use UTF-8 instead of ASCII (Python 2 only).
reload(sys)
sys.setdefaultencoding('utf-8')

# Shared append-mode log of every URL fetched by the network layer.
# NOTE(review): never closed explicitly — relies on interpreter exit to flush.
fn_log = 'url_dd.txt'
fp_log = open(fn_log, 'ab+')
>
class WebPage(QWebPage):
    """QWebPage subclass that reports JavaScript console messages on stderr."""

    def __init__(self, logger=None, parent=None):
        # `logger` is accepted for interface compatibility but currently unused.
        super(WebPage, self).__init__(parent)

    def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
        """Qt callback: write a JS console message and its origin to stderr.

        BUGFIX: corrected the typo "Javascritp" in the original message.
        """
        sys.stderr.write('Javascript error at line number %d\n' % (lineNumber, ))
        sys.stderr.write('%s\n' % (message, ))
        sys.stderr.write('Source ID: %s\n' % (sourceID, ))
>
>
>>
>> class Crawler(QApplication):
>
>
>
>     def __init__(self, url):
>
>         super(Crawler, self).__init__(sys.argv)
>
>
>
>         self.url = url
>
>         self.web_view = QWebView()
>
>         self.web_page = WebPage()
>
>         self.web_view.setPage(self.web_page)
>
>         self.web_frame = self.web_page.mainFrame()
>
>
>>         self.network = NetworkAccessManager()
>
>         self.web_page.setNetworkAccessManager(self.network)
>
>
>
>         self.settings = self.web_page.settings().globalSettings()
>
>         self.settings.setAttribute(QWebSettings.AutoLoadImages, False)
>
>         self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
>
>         QWebSettings.clearMemoryCaches()
>
>
>>         self.web_view.resize(1024, 9000)
>
>
>>         self.connect(self.web_page, SIGNAL('loadFinished(bool)'),
>> self.loadFinished)
>
>
>>         print 'Before loading'
>
>         self.web_view.load(QUrl(self.url))
>
>         print 'After loading'
>
>
>>     def loadFinished(self, ok):
>
>         print 'Start loadFinished()'
>
>
>>         print 'Start writing'
>
>         with open('content_dd.txt', 'ab+') as fp:
>
>             fp.write(self.web_frame.toHtml().toUtf8())
>
>         print 'End writing'
>
>
>
>         print 'End loadFinished()'
>
>
>>         try:
>
>             self.quit()
>
>         except Exception, e:
>
>             print 'FATAL ERROR: %s' % (str(e), )
>
>
>>
>> class NetworkAccessManager(QNetworkAccessManager):
>
>
>
>     def __init__(self):
>
>         super(NetworkAccessManager, self).__init__()
>
>         # QNetworkAccessManager.__init__(self)
>
>         self.connect(self, SIGNAL('finished (QNetworkReply *)'),
>> self.finishd)
>
>
>
>     def createRequest(self, operation, request, data):
>
>         # url = request.url().toString()
>
>         self.setNetworkAccessible(self.Accessible)
>
>
>
>         return QNetworkAccessManager.createRequest(self, operation,
>> request, data)
>
>
>>     def finishd(self, reply):
>
>         print 'In NetworkAccessManager finishd'
>
>         url = str(reply.url().toString())
>
>
>
>         log = '%s: %s\n' % (time.ctime(), url)
>
>         fp_log.write(log)
>
>
>>         print url
>
>
>>
if __name__ == '__main__':
    # Target product page; swap in any other URL to crawl it instead.
    # url = 'http://product.dangdang.com/product.aspx?product_id=22822333'
    target_url = 'http://product.dangdang.com/product.aspx?product_id=22848707'

    app = Crawler(target_url)
    sys.exit(app.exec_())
>
>
>
-- 
宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.qt-project.org/pipermail/pyside/attachments/20121022/adb11475/attachment.html>


More information about the PySide mailing list