Initially I thought you were missing scrolling, but it looks like you are
only interested in getting the content, either as text or as HTML. I am
not certain, but some websites respond particularly well to certain
clients. You can play with the client (user-agent) string and see how the
site behaves.

On Mon, Oct 22, 2012 at 1:11 AM, flyer <flyer103 at gmail.com> wrote:
> I wrote a Python script using QtWebKit to get all page info, including info
> generated by AJAX requests. I run the code below on a CentOS server with
> the following settings:
>> $ Xvfb :100 -screen 0 9000x15000x24 &
>> $ export DISPLAY=:100
> The following code worked; however, it could only get one screen's worth of
> the web page, i.e. it gets a different amount of info depending on the screen
> resolution. I could only get part of the info of the web page.
> I have tried using Selenium, and I can get all the web info if I set a large
> screen resolution using Xvfb.
> Please give me some tips on how to solve the problem. Any manual for
> QtWebKit would also be appreciated, because I can't find much material
> about it.
> The following is my code:
>>> #!/usr/bin/env python
>>> #coding: utf-8
>>> import sys
>>> import time
>>> from PySide.QtCore import QUrl, SIGNAL
>>> from PySide.QtGui import QApplication
>>> from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
>>> from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest
>>> reload(sys)
>>> sys.setdefaultencoding('utf-8')
>>> fn_log = 'url_dd.txt'
>>> fp_log = open(fn_log, 'ab+')
>>> class WebPage(QWebPage):
>>>     def __init__(self, logger=None, parent=None):
>>>         super(WebPage, self).__init__(parent)
>>>     def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
>>>         sys.stderr.write('Javascritp error at line number %d\n' %
>>> (lineNumber))
>>>         sys.stderr.write('%s\n' % (message, ))
>>>         sys.stderr.write('Source ID: %s\n' % (sourceID, ))
>>> class Crawler(QApplication):
>>>     def __init__(self, url):
>>>         super(Crawler, self).__init__(sys.argv)
>>>         self.url = url
>>>         self.web_view = QWebView()
>>>         self.web_page = WebPage()
>>>         self.web_view.setPage(self.web_page)
>>>         self.web_frame = self.web_page.mainFrame()
>>>         self.network = NetworkAccessManager()
>>>         self.web_page.setNetworkAccessManager(self.network)
>>>         self.settings = self.web_page.settings().globalSettings()
>>>         self.settings.setAttribute(QWebSettings.AutoLoadImages, False)
>>>         self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
>>>         QWebSettings.clearMemoryCaches()
>>>         self.web_view.resize(1024, 9000)
>>>         self.connect(self.web_page, SIGNAL('loadFinished(bool)'),
>>> self.loadFinished)
>>>         print 'Before loading'
>>>         self.web_view.load(QUrl(self.url))
>>>         print 'After loading'
>>>     def loadFinished(self, ok):
>>>         print 'Start loadFinished()'
>>>         print 'Start writing'
>>>         with open('content_dd.txt', 'ab+') as fp:
>>>             fp.write(self.web_frame.toHtml().toUtf8())
>>>         print 'End writing'
>>>         print 'End loadFinished()'
>>>         try:
>>>             self.quit()
>>>         except Exception, e:
>>>             print 'FATAL ERROR: %s' % (str(e), )
>>> class NetworkAccessManager(QNetworkAccessManager):
>>>     def __init__(self):
>>>         super(NetworkAccessManager, self).__init__()
>>>         # QNetworkAccessManager.__init__(self)
>>>         self.connect(self, SIGNAL('finished (QNetworkReply *)'),
>>> self.finishd)
>>>     def createRequest(self, operation, request, data):
>>>         # url = request.url().toString()
>>>         self.setNetworkAccessible(self.Accessible)
>>>         return QNetworkAccessManager.createRequest(self, operation,
>>> request, data)
>>>     def finishd(self, reply):
>>>         print 'In NetworkAccessManager finishd'
>>>         url = str(reply.url().toString())
>>>         log = '%s: %s\n' % (time.ctime(), url)
>>>         fp_log.write(log)
>>>         print url
>>> if __name__ == '__main__':
>>>     # url = 'http://product.dangdang.com/product.aspx?product_id=22822333'
>>>     url = 'http://product.dangdang.com/product.aspx?product_id=22848707'
>>>     crawler = Crawler(url)
>>>     sys.exit(crawler.exec_())
