domingo, 3 de agosto de 2014

Python para Excel: Mi primer Programa.

Sirve para extraer la información desordenada de códigos, descripciones y precios de entre mas de 1000 archivos que tengo en una carpeta:

import openpyxl, xlrd, os.path, re, time, datetime

#first, create a workbook that will contain the merged info.
database = openpyxl.Workbook()

#now create a new worksheet
databaseSheet = database.create_sheet(0)

#rename worksheet.
databaseSheet.title = "Database"

#first and second lines in the worksheet:
databaseSheet.cell(row=1,column=1).value = 'PRICE RECORD OF HUAWEI PRODUCTS IMPORTED FROM 2009'
databaseSheet.cell(row=2,column=1).value ='Part Number'
databaseSheet.cell(row=2,column=2).value = 'Description'
databaseSheet.cell(row=2,column=3).value = 'Quantity'
databaseSheet.cell(row=2,column=4).value = 'Unit Price'
databaseSheet.cell(row=2,column=5).value = 'Source File'
databaseSheet.cell(row=2,column=6).value = 'Date'

rowPaste = 3 #the first row free to start pasting the info
scannedFiles = 0
pathNames = ["d:/rubenbk/confirmar facturas/","d:/rubenbk/database/"]

for path in pathNames:
    #list all files to be searched
    for root, folders, files in os.walk(path):
            for file in files:
                foundColumns = 0 #will count the columns we're looking on each row
                #test if it is an excel file using regular expressions
                match = re.search(r'[^\?]*\w+\.xl\w+', file)
                #if the match object is true: 
                if match:
                    print file
                    #opens each file in folder for reading
                    currentFile = xlrd.open_workbook(os.path.join(root,file))
                    #gets the first sheet for reading
                    currentSheet = currentFile.sheet_by_index(0) #gets the first sheet.
                    #starts iteration on each row and cell in the active sheet
                    #in order to find the columns we will extract the info from.
                    foundColumns = 0
                    for row in range(currentSheet.nrows):
                        for col in range(currentSheet.ncols):
                            #looks for the pattern of Unit Price
                            matchPrice = re.search(r'^[Uu][\w\.\s]*[Pp][Rr][Ii][Cc][Ee].*', unicode(currentSheet.cell_value(row,col)))
                            #looks for part number
                            matchPart = re.search(r'[Pp][Aa][Rr][Tt]\s[Nn][\s\S\W.]*', unicode(currentSheet.cell_value(row,col)))
                            #looks for description
                            matchDescription = re.search(r'[Dd][Ee][Ss][Cc][Rr][Ii][Pp][Tt][Ii][Oo][Nn]', unicode(currentSheet.cell_value(row,col)))
                            #looks for quantity
                            matchQty = re.search(r'[Qq][^Ll\s]*[Tt][Yy]', unicode(currentSheet.cell_value(row,col)))
                            #if any column was found, it adds to the foundColumns counter.
                            if matchPart:
                                #print "Part matched"
                                #print matchPart.group()
                                partCol = col
                                foundColumns +=1
                            if matchDescription:
                                #print 'Description matched'
                                #print matchDescription.group()
                                descriptionCol = col
                                foundColumns +=1
                            if matchQty:
                                #print 'Qty matched'
                                #print matchQty.group()
                                qtyCol = col
                                foundColumns +=1
                            if matchPrice:
                                #print 'Price matched'
                                #print matchPrice.group()
                                priceCol = col
                                foundColumns +=1
                        #if we found the four columns in the searched row:
                        #if foundColumns > 1: print foundColumns
                        if foundColumns == 4:
                            #print "found 4 columns!"
                            scannedFiles +=1
                            #we start getting the info from the rows below the one were we found the 4 variables
                            for rowCopy in range(row+1, currentSheet.nrows): #fromw the next row till the last in this document.
                                partValue = currentSheet.cell_value(rowCopy,partCol)
                                descValue = currentSheet.cell_value(rowCopy, descriptionCol)
                                qtyValue = currentSheet.cell_value(rowCopy, qtyCol)
                                priceValue = currentSheet.cell_value(rowCopy, priceCol)
                                #now we start writing the info in the database:
                                #if not currentSheet.cell_type(rowCopy,qtyCol) in (xlrd.XL_CELL_BLANK, xlrd.XL_CELL_EMPTY):
                                #if not (currentSheet.cell_type(rowCopy,partCol) or currentSheet.cell_type(rowCopy,qtyCol)
                                #or currentSheet.cell_type(rowCopy,descriptionCol) or currentSheet.cell_type(rowCopy,priceCol)) in (xlrd.XL_CELL_BLANK, xlrd.XL_CELL_EMPTY):
                                #if not (currentSheet.cell_value(rowCopy,partCol)=='' or currentSheet.cell_value(rowCopy,qtyCol) == ''
                                #or currentSheet.cell_value(rowCopy,descriptionCol)== '' or currentSheet.cell_value(rowCopy,priceCol) == ''):
                                if not (currentSheet.cell_value(rowCopy,qtyCol) == ''
                                or currentSheet.cell_value(rowCopy,descriptionCol)== '' or currentSheet.cell_value(rowCopy,priceCol) == ''):

                                    databaseSheet.cell(row=rowPaste,column= 1).value = partValue
                                    databaseSheet.cell(row=rowPaste,column= 2).value = descValue
                                    databaseSheet.cell(row=rowPaste,column= 3).value = qtyValue
                                    databaseSheet.cell(row=rowPaste,column= 4).value = priceValue
                                    databaseSheet.cell(row=rowPaste,column= 5).value = file
                                    databaseSheet.cell(row=rowPaste,column= 6).value = time.strftime('%d-%m-%Y',time.gmtime(os.path.getctime(os.path.join(root,file))))
                                    rowPaste +=1
                            break #we break the second for loop because it is now unnecessary.
                            foundColumns = 0
print "Total scanned files: %d" %scannedFiles
print 'Formatting database...'
for rowFinal in range(3,databaseSheet.get_highest_row()):
    fileValue = str(databaseSheet.cell(row=rowFinal, column = 5).value)
    if 'HS code' in fileValue:
        for columnFinal in range(1, 6):
            databaseSheet.cell(row=rowFinal, column=columnFinal).value =  databaseSheet.cell(row=rowFinal + 1, column=columnFinal).value
print 'Saving database...'    
print 'Done.'

No hay comentarios:

Publicar un comentario