[PySide] using QtWebKit to get all info of web pages including that generated by AJAX

flyer flyer103 at gmail.com
Wed Oct 24 08:06:54 CEST 2012


Thanks anyway.

I found this example: <http://webscraping.com/blog/Scraping-JavaScript-webpages-with-webkit/>

After adapting it to crawl a page whose AJAX content the previous code could
not parse, I now get nearly all of the page's content, including the parts
generated by AJAX when the scroll bar is scrolled down.

The new example code also solves another problem with the previous version:
the old code could not quit properly and had to be killed with a shell
command, or stopped with some tricky statements that caused a segmentation
fault.

The new example code works well for one page at a time. However, when I try
to fetch many pages in a row, it only retrieves part of the first URL (the
AJAX content of the first URL is not parsed), the remaining URLs are not
handled at all, and I get the following exception:

    QObject::connect: Cannot connect (null)::configurationAdded(QNetworkConfiguration) to QNetworkConfigurationManager::configurationAdded(QNetworkConfiguration)
    QObject::connect: Cannot connect (null)::configurationRemoved(QNetworkConfiguration) to QNetworkConfigurationManager::configurationRemoved(QNetworkConfiguration)
    QObject::connect: Cannot connect (null)::configurationUpdateComplete() to QNetworkConfigurationManager::updateCompleted()
    QObject::connect: Cannot connect (null)::onlineStateChanged(bool) to QNetworkConfigurationManager::onlineStateChanged(bool)
    QObject::connect: Cannot connect (null)::configurationChanged(QNetworkConfiguration) to QNetworkConfigurationManager::configurationChanged(QNetworkConfiguration)
    Segmentation fault

I googled this problem and searched Stack Overflow, but could not find a
good solution.

Please give me some tips on this. Thank you in advance.

The following is the new code I wrote:



#!/usr/bin/env python
# coding: utf-8

import sys
import re
import time

from PyQt4.QtCore import SIGNAL, QUrl, QSize
from PyQt4.QtGui import QApplication
from PyQt4.QtWebKit import QWebPage, QWebSettings
from PyQt4.QtNetwork import QNetworkAccessManager

reload(sys)
sys.setdefaultencoding('utf-8')

fn_url = 'url.txt'
fp_url = open(fn_url, 'ab+')


class Render(QWebPage):

    def __init__(self, url):
        self.url = url
        self.app = QApplication(sys.argv)
        QWebPage.__init__(self)

        # Use a large viewport so that content loaded on scroll is triggered.
        # (setViewportSize() returns None, so there is no point storing it.)
        self.setViewportSize(QSize(1600, 9000))

        self.network = NetworkAccessManager()
        self.setNetworkAccessManager(self.network)

        self.loadFinished.connect(self._loadFinished)

        QWebSettings.clearMemoryCaches()

        self.mainFrame().load(QUrl(self.url))
        self.app.exec_()

    def _loadFinished(self, res):
        self.frame = self.mainFrame()
        self.app.quit()
        self.deleteLater()


class NetworkAccessManager(QNetworkAccessManager):

    def __init__(self):
        super(NetworkAccessManager, self).__init__()
        self.connect(self, SIGNAL('finished (QNetworkReply *)'), self.finishd)

        # Resource types we do not want to fetch.
        self.ban = (
            r'.*\.css',
            r'.*\.jpg',
            r'.*\.png',
        )

    def createRequest(self, operation, request, data):
        url = str(request.url().toString())
        # Block any URL matching the ban list (previously only .css was
        # checked and the ban tuple was never used).
        if any(re.search(pat, url) for pat in self.ban):
            self.setNetworkAccessible(QNetworkAccessManager.NotAccessible)
        else:
            self.setNetworkAccessible(QNetworkAccessManager.Accessible)

        return QNetworkAccessManager.createRequest(self, operation,
                                                   request, data)

    def finishd(self, reply):
        print 'In NetworkAccessManager finishd'
        url = str(reply.url().toString())

        log = '%s: %s\n' % (time.ctime(), url)
        fp_url.write(log)

        print url


if __name__ == '__main__':
    urls_jd = (
        'http://www.360buy.com/product/135896.html',
        'http://www.360buy.com/product/742573.html',
        'http://www.360buy.com/product/724557.html',
        'http://www.360buy.com/product/690189.html',
        'http://www.360buy.com/product/721948.html',
        'http://www.360buy.com/product/722933.html',
        'http://book.360buy.com/10120243.html',
        'http://book.360buy.com/10009164.html',
        'http://book.360buy.com/10875531.html',
        'http://mvd.360buy.com/20003405.html',
        'http://mvd.360buy.com/20064481.html',
        'http://mvd.360buy.com/20063053.html',
        'http://mvd.360buy.com/20061277.html',
        'http://mvd.360buy.com/20006893.html',
    )

    for url in urls_jd:
        r = Render(url)
        html = r.frame.toHtml().toUtf8()

        # Name the output after the last path segment, e.g. '135896.html.txt'.
        # (Splitting on '=' left '/' characters in the filename for these URLs.)
        fn = '%s.txt' % (url.rsplit('/', 1)[-1], )
        with open(fn, 'ab+') as fp:
            fp.write(html)
        del r
        print 'File %s' % (fn, )
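The output-filename logic can be exercised without Qt at all; a small pure-Python helper (the name `output_filename` is my own, for illustration) that keeps only the last path segment of the product URL:

```python
def output_filename(url):
    # Drop any query string or fragment, then keep the last path segment,
    # so 'http://www.360buy.com/product/135896.html' -> '135896.html.txt'.
    path = url.split('?', 1)[0].split('#', 1)[0]
    name = path.rstrip('/').rsplit('/', 1)[-1] or 'index'
    return '%s.txt' % (name, )

print(output_filename('http://www.360buy.com/product/135896.html'))
# -> 135896.html.txt
```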



On Mon, Oct 22, 2012 at 11:52 PM, Srini Kommoori <vasure at gmail.com> wrote:

> Initially I thought you are missing scrolling but looks like you are
> only interested in getting the content either in text or html. I am
> not definite but some websites respond to particular clients really
> well. You can play with client string and see how it behaves.
>
>
> On Mon, Oct 22, 2012 at 1:11 AM, flyer <flyer103 at gmail.com> wrote:
> > I wrote a python script using QtWebKit to get all page info, including
> > info generated by AJAX requests. I ran the following code on a CentOS
> > server after setting up a virtual display:
> >
> >     $ Xvfb :100 -screen 0 9000x15000x24 &
> >     $ export DISPLAY=:100
> >
> > The following code worked; however, it could only get one screen's worth
> > of info from the web page, i.e. the amount of info retrieved depended on
> > the screen resolution, so I only got part of the page's info.
> >
> > I have tried selenium, and with it I can get all the page info if I set
> > a large screen resolution with Xvfb.
> >
> > Please give me some tips on how to solve the problem; any manual for
> > QtWebKit would also be appreciated, because I can't find much material
> > about it.
> >
> > The following is my code:
> >
> > #!/usr/bin/env python
> > # coding: utf-8
> >
> > import sys
> > import time
> >
> > from PySide.QtCore import QUrl, SIGNAL
> > from PySide.QtGui import QApplication
> > from PySide.QtWebKit import QWebPage, QWebView, QWebSettings
> > from PySide.QtNetwork import QNetworkAccessManager, QNetworkRequest
> >
> > reload(sys)
> > sys.setdefaultencoding('utf-8')
> >
> > fn_log = 'url_dd.txt'
> > fp_log = open(fn_log, 'ab+')
> >
> >
> > class WebPage(QWebPage):
> >
> >     def __init__(self, logger=None, parent=None):
> >         super(WebPage, self).__init__(parent)
> >
> >     def javaScriptConsoleMessage(self, message, lineNumber, sourceID):
> >         sys.stderr.write('JavaScript error at line number %d\n' % (lineNumber, ))
> >         sys.stderr.write('%s\n' % (message, ))
> >         sys.stderr.write('Source ID: %s\n' % (sourceID, ))
> >
> >
> > class Crawler(QApplication):
> >
> >     def __init__(self, url):
> >         super(Crawler, self).__init__(sys.argv)
> >
> >         self.url = url
> >         self.web_view = QWebView()
> >         self.web_page = WebPage()
> >         self.web_view.setPage(self.web_page)
> >         self.web_frame = self.web_page.mainFrame()
> >
> >         self.network = NetworkAccessManager()
> >         self.web_page.setNetworkAccessManager(self.network)
> >
> >         self.settings = self.web_page.settings().globalSettings()
> >         self.settings.setAttribute(QWebSettings.AutoLoadImages, False)
> >         self.settings.setAttribute(QWebSettings.PluginsEnabled, False)
> >         QWebSettings.clearMemoryCaches()
> >
> >         self.web_view.resize(1024, 9000)
> >
> >         self.connect(self.web_page, SIGNAL('loadFinished(bool)'),
> >                      self.loadFinished)
> >
> >         print 'Before loading'
> >         self.web_view.load(QUrl(self.url))
> >         print 'After loading'
> >
> >     def loadFinished(self, ok):
> >         print 'Start loadFinished()'
> >
> >         print 'Start writing'
> >         with open('content_dd.txt', 'ab+') as fp:
> >             fp.write(self.web_frame.toHtml().toUtf8())
> >         print 'End writing'
> >
> >         print 'End loadFinished()'
> >
> >         try:
> >             self.quit()
> >         except Exception, e:
> >             print 'FATAL ERROR: %s' % (str(e), )
> >
> >
> > class NetworkAccessManager(QNetworkAccessManager):
> >
> >     def __init__(self):
> >         super(NetworkAccessManager, self).__init__()
> >         self.connect(self, SIGNAL('finished (QNetworkReply *)'),
> >                      self.finishd)
> >
> >     def createRequest(self, operation, request, data):
> >         self.setNetworkAccessible(self.Accessible)
> >         return QNetworkAccessManager.createRequest(self, operation,
> >                                                    request, data)
> >
> >     def finishd(self, reply):
> >         print 'In NetworkAccessManager finishd'
> >         url = str(reply.url().toString())
> >
> >         log = '%s: %s\n' % (time.ctime(), url)
> >         fp_log.write(log)
> >
> >         print url
> >
> >
> > if __name__ == '__main__':
> >     # url = 'http://product.dangdang.com/product.aspx?product_id=22822333'
> >     url = 'http://product.dangdang.com/product.aspx?product_id=22848707'
> >
> >     crawler = Crawler(url)
> >     sys.exit(crawler.exec_())
> >
> > --
> > 宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。
> >
> >
> >
> > _______________________________________________
> > PySide mailing list
> > PySide at qt-project.org
> > http://lists.qt-project.org/mailman/listinfo/pyside
> >
>



-- 
宠辱不惊,闲看庭前花开花落;去留无意,漫随天边云卷云舒。