Python Lianjia (链家) Scraper

In my spare time I built a scraper for Lianjia (链家) that crawls listing data into SQLite, which makes the results easy to browse and compare.
It closely follows 冰蓝's blog post http://lanbing510.info/2016/03/15/Lianjia-Spider.html?utm_source=tuicool&utm_medium=referral, adapted to Lianjia's latest web layout (as of 2017-03) to crawl the second-hand homes currently on sale.

Lianjia monitors traffic coming from the same IP over short periods, so crawling with multiple threads can quickly trigger a CAPTCHA. I tried crawling through a pool of proxy IPs, but the free proxies available were few and unstable, so I dropped that idea. Since I didn't need much data anyway, I simply added a delay to mimic a human browsing slowly. There was another issue: when requests come too fast, the response sometimes contains no parseable data, so I placed the delay after the BeautifulSoup parsing and added a check that retries the request up to 5 times when no data is found. With that, the full data set could be crawled:

time.sleep(np.random.rand()*3+3)

Here is the key parsing code. It is quite rough, for reference only :)
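The snippet assumes a handful of imports plus a pool of request headers `hds` that are not shown here. Below is a minimal sketch of what that setup might look like; the exact User-Agent strings are placeholders, not the original list:

# -*- coding: utf-8 -*-
# Assumed setup (sketch): imports used by the snippet and a small pool of
# request headers. Rotating the User-Agent makes successive requests look
# a little less uniform.
import re
import time
import json
import random
import urllib2
import numpy as np
from bs4 import BeautifulSoup

hds = [
    {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'},
    {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/602.4.8 (KHTML, like Gecko) Version/10.0.3 Safari/602.4.8'},
    {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0'},
]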

def onsell_spider(mydb,url_page=u"http://gz.lianjia.com/ershoufang/pg1rs越秀/",area=u"越秀"):
    # time.sleep(np.random.rand()*1)
    print url_page
    counts = 0
    trytime = 0
    # retry while the page yields no listings, giving up after a few attempts
    while counts == 0 and trytime <= 5:
        try:
            # fetch the page with a random header from the hds pool
            req = urllib2.Request(url_page,headers=hds[random.randint(0,len(hds)-1)])
            source_code = urllib2.urlopen(req,timeout=10).read()
            plain_text = unicode(source_code)  # optionally: unicode(source_code, errors='ignore')
            soup = BeautifulSoup(plain_text, "lxml")
        except (urllib2.HTTPError, urllib2.URLError), e:
            print e
            exception_write('onsell_spider',url_page)
            return
        except Exception,e:
            print e
            exception_write('onsell_spider',url_page)
            return
        # pause 3-6 seconds after parsing, to stay under Lianjia's rate limit
        time.sleep(np.random.rand()*3+3)
        # each listing on the page sits inside <div class="info clear">
        cj_list=soup.findAll('div',{'class':'info clear'})

        print len(cj_list)
        counts = len(cj_list)
        trytime = trytime + 1
        for cj in cj_list:
            info_dict={}
            href=cj.find('a')
            if not href:
                continue
            info_dict.update({u'链接':href.attrs['href']})
            name=cj.find('a').text
            info_dict.update({u'标题':name})

            # target sqlite columns:
            # href TEXT primary key UNIQUE, name TEXT, community TEXT, style TEXT, area TEXT, orientation TEXT, decoration TEXT, haslift TEXT, floor TEXT, year TEXT, bplace TEXT, splace TEXT, unit_price TEXT, total_price TEXT, subway TEXT, other TEXT

            # the regex expects <span .../><a>小区名</a> followed by the '|'-separated fields
            content=unicode(cj.find('div',{'class':'houseInfo'}).renderContents().strip())
            info=re.match(r"<span .*></span><a .*>(.*)</a>(.*)", content)

            # print info
            if info:
                info=info.groups()
                info_dict.update({u'小区':info[0]})

                # the remaining '|'-separated fields: 户型 | 面积 | 朝向 | 装修 | 有无电梯
                fields = info[1].strip().split('|')
                keys = [u'户型', u'面积', u'朝向', u'装修', u'有无电梯']
                for idx, key in enumerate(keys, start=1):
                    try:
                        info_dict.update({key: fields[idx].strip()})
                    except IndexError:
                        info_dict.update({key: ''})

            # the regex captures the floor info (up to the ')'), the build year, and the sub-district inside the <a>
            content=unicode(cj.find('div',{'class':'positionInfo'}).renderContents().strip())
            info=re.match(r"<span .*></span>(.*)\)(.*)<a .*>(.*)</a>", content)
            if info:
                info=info.groups()
                # print info
                info_dict.update({u'楼层':info[0]})
                info_dict.update({u'建造时间':info[1]})
                info_dict.update({u'大区域':area})
                try:
                    info_dict.update({u'小区域':info[2]})
                except Exception,e:
                    info_dict.update({u'小区域':''})


            content=cj.find('div',{'class':'unitPrice'}).find('span').text
            if content:
                info_dict.update({u'单价':content})
            content=cj.find('div',{'class':'totalPrice'}).find('span').text
            if content:
                info_dict.update({u'总价':content})

            content=cj.find('span',{'class':'subway'})
            # print content
            if content:
                try:
                    info_dict.update({u'地铁':content.text})
                except Exception,e:
                    info_dict.update({u'地铁':''})

            content=cj.find('div',{'class':'followInfo'}).text
            if content:
                info_dict.update({u'其他':content})

            # build the INSERT statement for this listing and write it to sqlite
            command=sql_onsell_insert_command(info_dict)
            mydb.execute(command,1)


def do_onsell_spider(mydb,area=u"越秀"):

    url=u"http://gz.lianjia.com/ershoufang/pg%drs%s/" % (1,area)

    try:
        # fetch page 1 first, just to read the total page count
        req = urllib2.Request(url,headers=hds[random.randint(0,len(hds)-1)])
        source_code = urllib2.urlopen(req,timeout=10).read()
        plain_text = unicode(source_code)  # optionally: unicode(source_code, errors='ignore')
        soup = BeautifulSoup(plain_text, "lxml")
    except (urllib2.HTTPError, urllib2.URLError), e:
        print e
        exception_write('do_onsell_spider',area)
        return
    except Exception,e:
        print e
        exception_write('do_onsell_spider',area)
        return

    time.sleep(np.random.rand()*1+1)
    # the total page count is embedded as JSON in the page-data attribute,
    # e.g. {"totalPage":100,"curPage":1}; parse it with json.loads (needs import json)
    total_pages = 0
    content=soup.find('div',{'class':'page-box house-lst-page-box'})
    # print soup

    if content:
        page_data = json.loads(content.get('page-data'))
        total_pages = page_data['totalPage']

    print total_pages

    # walk through every result page of this district
    for i in range(total_pages):
        time.sleep(np.random.rand()*1)
        url_page=u"http://gz.lianjia.com/ershoufang/pg%drs%s/" % (i+1,area)
        onsell_spider(mydb,url_page,area)
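
`exception_write`, `sql_onsell_insert_command` and the sqlite table itself are helpers that are not shown above. Judging from the column comment inside `onsell_spider`, they could look roughly like the sketch below; the key-to-column mapping, file names and SQL are my guesses, not the original code:

# Sketch of the assumed helpers (not the original implementation).

def exception_write(func_name, info):
    # log failed requests so they can be retried later
    with open('exception.log', 'a') as f:
        f.write(('%s %s\n' % (func_name, info)).encode('utf-8'))

# target table, following the column comment in onsell_spider
sql_onsell_create = (
    "CREATE TABLE IF NOT EXISTS onsell ("
    "href TEXT PRIMARY KEY UNIQUE, name TEXT, community TEXT, style TEXT, "
    "area TEXT, orientation TEXT, decoration TEXT, haslift TEXT, floor TEXT, "
    "year TEXT, bplace TEXT, splace TEXT, unit_price TEXT, total_price TEXT, "
    "subway TEXT, other TEXT)")

# assumed mapping from the Chinese info_dict keys to the table columns
onsell_columns = [
    (u'链接', 'href'), (u'标题', 'name'), (u'小区', 'community'),
    (u'户型', 'style'), (u'面积', 'area'), (u'朝向', 'orientation'),
    (u'装修', 'decoration'), (u'有无电梯', 'haslift'), (u'楼层', 'floor'),
    (u'建造时间', 'year'), (u'大区域', 'bplace'), (u'小区域', 'splace'),
    (u'单价', 'unit_price'), (u'总价', 'total_price'), (u'地铁', 'subway'),
    (u'其他', 'other')]

def sql_onsell_insert_command(info_dict):
    # INSERT OR REPLACE keeps one row per listing URL when a page is re-crawled;
    # a parameterized query would be safer, but mydb.execute takes a plain string here
    values = [info_dict.get(key, u'').replace(u'"', u'') for key, _ in onsell_columns]
    return u'INSERT OR REPLACE INTO onsell VALUES ("%s")' % u'","'.join(values)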

I'm still not very familiar with BeautifulSoup's API, so the methods used here are crude and blunt; I'll read up on it later.
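
For example, the regex over `renderContents()` could probably be replaced with BeautifulSoup's own text accessors. An untested sketch, assuming the field order of the 2017 listing layout:

def parse_house_info(cj):
    # Untested sketch: read the houseInfo fields via get_text() instead of a
    # regex over renderContents(). The '|'-separated field order is assumed to
    # be: 小区 | 户型 | 面积 | 朝向 | 装修 | 有无电梯
    house_info = cj.find('div', {'class': 'houseInfo'})
    if not house_info:
        return {}
    parts = [p.strip() for p in house_info.get_text().split('|')]
    keys = [u'小区', u'户型', u'面积', u'朝向', u'装修', u'有无电梯']
    return dict(zip(keys, parts))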

The crawled data: