import openpyxl, xlrd, os.path, re, time, datetime
#first, create a workbook that will contain the merged info.
database = openpyxl.Workbook()
#now create a new worksheet
databaseSheet = database.create_sheet(0)
#rename worksheet.
databaseSheet.title = "Database"
#first and second lines in the worksheet:
databaseSheet.cell(row=1,column=1).value = 'PRICE RECORD OF HUAWEI PRODUCTS IMPORTED FROM 2009'
databaseSheet.cell(row=2,column=1).value ='Part Number'
databaseSheet.cell(row=2,column=2).value = 'Description'
databaseSheet.cell(row=2,column=3).value = 'Quantity'
databaseSheet.cell(row=2,column=4).value = 'Unit Price'
databaseSheet.cell(row=2,column=5).value = 'Source File'
databaseSheet.cell(row=2,column=6).value = 'Date'
rowPaste = 3 #the first row free to start pasting the info
scannedFiles = 0
pathNames = ["d:/rubenbk/confirmar facturas/","d:/rubenbk/database/"]
for path in pathNames:
#list all files to be searched
for root, folders, files in os.walk(path):
for file in files:
foundColumns = 0 #will count the columns we're looking on each row
#test if it is an excel file using regular expressions
match = re.search(r'[^\?]*\w+\.xl\w+', file)
#if the match object is true:
if match:
print file
#opens each file in folder for reading
currentFile = xlrd.open_workbook(os.path.join(root,file))
#gets the first sheet for reading
currentSheet = currentFile.sheet_by_index(0) #gets the first sheet.
#starts iteration on each row and cell in the active sheet
#in order to find the columns we will extract the info from.
foundColumns = 0
for row in range(currentSheet.nrows):
for col in range(currentSheet.ncols):
#looks for the pattern of Unit Price
matchPrice = re.search(r'^[Uu][\w\.\s]*[Pp][Rr][Ii][Cc][Ee].*', unicode(currentSheet.cell_value(row,col)))
#looks for part number
matchPart = re.search(r'[Pp][Aa][Rr][Tt]\s[Nn][\s\S\W.]*', unicode(currentSheet.cell_value(row,col)))
#looks for description
matchDescription = re.search(r'[Dd][Ee][Ss][Cc][Rr][Ii][Pp][Tt][Ii][Oo][Nn]', unicode(currentSheet.cell_value(row,col)))
#looks for quantity
matchQty = re.search(r'[Qq][^Ll\s]*[Tt][Yy]', unicode(currentSheet.cell_value(row,col)))
#if any column was found, it adds to the foundColumns counter.
if matchPart:
#print "Part matched"
#print matchPart.group()
partCol = col
foundColumns +=1
if matchDescription:
#print 'Description matched'
#print matchDescription.group()
descriptionCol = col
foundColumns +=1
if matchQty:
#print 'Qty matched'
#print matchQty.group()
qtyCol = col
foundColumns +=1
if matchPrice:
#print 'Price matched'
#print matchPrice.group()
priceCol = col
foundColumns +=1
#if we found the four columns in the searched row:
#if foundColumns > 1: print foundColumns
if foundColumns == 4:
#print "found 4 columns!"
scannedFiles +=1
#we start getting the info from the rows below the one were we found the 4 variables
for rowCopy in range(row+1, currentSheet.nrows): #fromw the next row till the last in this document.
partValue = currentSheet.cell_value(rowCopy,partCol)
descValue = currentSheet.cell_value(rowCopy, descriptionCol)
qtyValue = currentSheet.cell_value(rowCopy, qtyCol)
priceValue = currentSheet.cell_value(rowCopy, priceCol)
#now we start writing the info in the database:
#if not currentSheet.cell_type(rowCopy,qtyCol) in (xlrd.XL_CELL_BLANK, xlrd.XL_CELL_EMPTY):
#if not (currentSheet.cell_type(rowCopy,partCol) or currentSheet.cell_type(rowCopy,qtyCol)
#or currentSheet.cell_type(rowCopy,descriptionCol) or currentSheet.cell_type(rowCopy,priceCol)) in (xlrd.XL_CELL_BLANK, xlrd.XL_CELL_EMPTY):
#if not (currentSheet.cell_value(rowCopy,partCol)=='' or currentSheet.cell_value(rowCopy,qtyCol) == ''
#or currentSheet.cell_value(rowCopy,descriptionCol)== '' or currentSheet.cell_value(rowCopy,priceCol) == ''):
if not (currentSheet.cell_value(rowCopy,qtyCol) == ''
or currentSheet.cell_value(rowCopy,descriptionCol)== '' or currentSheet.cell_value(rowCopy,priceCol) == ''):
databaseSheet.cell(row=rowPaste,column= 1).value = partValue
databaseSheet.cell(row=rowPaste,column= 2).value = descValue
databaseSheet.cell(row=rowPaste,column= 3).value = qtyValue
databaseSheet.cell(row=rowPaste,column= 4).value = priceValue
databaseSheet.cell(row=rowPaste,column= 5).value = file
databaseSheet.cell(row=rowPaste,column= 6).value = time.strftime('%d-%m-%Y',time.gmtime(os.path.getctime(os.path.join(root,file))))
rowPaste +=1
break #we break the second for loop because it is now unnecessary.
else:
foundColumns = 0
continue
print "Total scanned files: %d" %scannedFiles
print 'Formatting database...'
for rowFinal in range(3,databaseSheet.get_highest_row()):
fileValue = str(databaseSheet.cell(row=rowFinal, column = 5).value)
if 'HS code' in fileValue:
for columnFinal in range(1, 6):
databaseSheet.cell(row=rowFinal, column=columnFinal).value = databaseSheet.cell(row=rowFinal + 1, column=columnFinal).value
print 'Saving database...'
database.save('D:/rubenbk/reports/pricesDatabase.xlsx')
print 'Done.'
No hay comentarios:
Publicar un comentario