Failing to right to DB

So I have this script working again using the indexing error a few posts down.  Now the script runs start to finish, but doesn't seem to parse the saved html files using beautifulsoup properly.  I am receiving a list index out of error when trying to write to the DB:

 Writing 0 records to table: photosLiked
[*] Writing 0 record(s) to database table: photosLiked
list index out of range

This is consistent for all categories.  I am assuming that the way the file is getting parsed is no longer meeting the list lengths specified in the code, but I'm not sure how to validate my assumption.

Here is the code section defining the columns:

"""def write2Database(dbName,dataList):
    try:
        cprint("[_] Writing "+str(len(dataList))+" record(s) to database table: "+dbName,"white")
        #print "[_] Writing "+str(len(dataList))+" record(s) to database table: "+dbName
        numOfColumns = len(dataList[0])
        c = conn.cursor()
        if numOfColumns==3:
            for i in dataList:
                try:
                    c.execute('INSERT INTO '+dbName+' VALUES (?,?,?)', i)
                    conn.commit()
                except sqlite3.IntegrityError:
                    continue
        if numOfColumns==4:
            for i in dataList:
                try:
                    c.execute('INSERT INTO '+dbName+' VALUES (?,?,?,?)', i)
                    conn.commit()
                except sqlite3.IntegrityError:
                    continue
        if numOfColumns==5:
            for i in dataList:
                try:
                    c.execute('INSERT INTO '+dbName+' VALUES (?,?,?,?,?)', i)
                    conn.commit()
                except sqlite3.IntegrityError:
                    continue
        if numOfColumns==9:
            for i in dataList:
                try:
                    c.execute('INSERT INTO '+dbName+' VALUES (?,?,?,?,?,?,?,?,?)', i)
                    conn.commit()
                except sqlite3.IntegrityError:
                    continue
    except TypeError as e:
        print e
        pass
    except IndexError as e:
        print e
        pass"""

Example of the parsing functions:

"""def parsePhotosOf(html):
    soup = BeautifulSoup(html)  
    photoPageLink = soup.findAll("a", {"class" : "_23q"})
    tempList = []
    for i in photoPageLink:
        html = str(i)
        soup1 = BeautifulSoup(html)
        pageName = soup1.findAll("img", {"class" : "img"})
        pageName1 = soup1.findAll("img", {"class" : "scaledImageFitWidth img"})
        pageName2 = soup1.findAll("img", {"class" : "_46-i img"})  
        for z in pageName2:
            if z['src'].endswith('.jpg'):
                url1 = i['href']
                r = re.compile('fbid=(.*?)&set=bc')
                m = r.search(url1)
                if m:
                    filename = 'fbid_'+ m.group(1)+'.html'
                    filename = filename.replace("profile.php?id=","")
                    if not os.path.lexists(filename):
                        #html1 = downloadPage(url1)
                        html1 = downloadFile(url1)
                        print "[_] Caching Photo Page: "+m.group(1)
                        text_file = open(filename, "w")
                        text_file.write(normalize(html1))
                        text_file.close()
                    else:
                        html1 = open(filename, 'r').read()
                soup2 = BeautifulSoup(html1)
                username2 = soup2.find("div", {"class" : "fbPhotoContributorName"})
                r = re.compile('a href="(._?)"')
                m = r.search(str(username2))
                if m:  
                    username3 = m.group(1)
                    username3 = username3.replace("https://www.facebook.com/","")
                    username3 = username3.replace("profile.php?id=","")
                    print "[_] Extracting Data from Photo Page: "+username3
                    tempList.append([str(uid),z['alt'],z['src'],i['href'],username3])
        for y in pageName1:
            if y['src'].endswith('.jpg'):
                url1 = i['href']
                r = re.compile('fbid=(._?)&set=bc')
                m = r.search(url1)
                if m:
                    filename = 'fbid_'+ m.group(1)+'.html'
                    filename = filename.replace("profile.php?id=","")
                    if not os.path.lexists(filename):
                        #html1 = downloadPage(url1)
                        html1 = downloadFile(url1)
                        print "[_] Caching Photo Page: "+m.group(1)
                        text_file = open(filename, "w")
                        text_file.write(normalize(html1))
                        text_file.close()
                    else:
                        html1 = open(filename, 'r').read()
                soup2 = BeautifulSoup(html1)
                username2 = soup2.find("div", {"class" : "fbPhotoContributorName"})
                r = re.compile('a href="(._?)"')
                m = r.search(str(username2))
                if m:  
                    username3 = m.group(1)
                    username3 = username3.replace("https://www.facebook.com/","")
                    username3 = username3.replace("profile.php?id=","")
                    print "[_] Extracting Data from Photo Page: "+username3
                    tempList.append([str(uid),y['alt'],y['src'],i['href'],username3])
        for x in pageName:
            if x['src'].endswith('.jpg'):
                url1 = i['href']
                r = re.compile('fbid=(._?)&set=bc')
                m = r.search(url1)
                if m:
                    filename = 'fbid_'+ m.group(1)+'.html'
                    filename = filename.replace("profile.php?id=","")
                    if not os.path.lexists(filename):
                        #html1 = downloadPage(url1)
                        html1 = downloadFile(url1)
                        print "[_] Caching Photo Page: "+m.group(1)
                        text_file = open(filename, "w")
                        text_file.write(normalize(html1))
                        text_file.close()
                    else:
                        html1 = open(filename, 'r').read()
                soup2 = BeautifulSoup(html1)
                username2 = soup2.find("div", {"class" : "fbPhotoContributorName"})
                r = re.compile('a href="(._?)"')
                m = r.search(str(username2))
                if m:  
                    username3 = m.group(1)
                    username3 = username3.replace("https://www.facebook.com/","")
                    username3 = username3.replace("profile.php?id=","")
                    print "[*] Extracting Data from Photo Page: "+username3
                    tempList.append([str(uid),x['alt'],x['src'],i['href'],username3])"""


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Failing to right to DB #27

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Failing to right to DB #27

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions