feat: searching data in excel files using new algorithm (massive performance improvement)
Some checks failed
Build / Build-and-ng-test (pull_request) Failing after 45s

This commit is contained in:
Mihajlo Medjedovic 2024-08-07 17:53:52 +02:00
parent 403d08c86a
commit bbb725c64c
5 changed files with 262 additions and 354 deletions

View File

@ -109,13 +109,8 @@ context('excel tests: ', function () {
openTableFromTree(libraryToOpenIncludes, 'mpe_x_test')
attachExcelFile('duplicate_column_excel.xlsx', () => {
cy.get('.abortMsg', { timeout: longerCommandTimeout })
.should('exist')
.then((elements: any) => {
if (elements[0]) {
if (elements[0].innerText.toLowerCase().includes('missing')) done()
}
})
submitExcel()
rejectExcel(done)
})
})

View File

@ -1,5 +1,5 @@
import { FileUploader } from './FileUploader.class'
import SheetInfo from './SheetInfo'
import FoundRangeInfo from './RangeInfo'
export interface ParseResult {
/**
@ -10,6 +10,6 @@ export interface ParseResult {
* In case of CSV file, won't be returned
*/
headerShow?: string[]
rangeSheetRes?: SheetInfo
rangeSheetRes?: FoundRangeInfo
uploader: FileUploader
}

View File

@ -0,0 +1,13 @@
/**
 * Describes where the searched data was located in an Excel workbook,
 * together with the headers that could not be matched.
 */
export default interface FoundRangeInfo {
  /** `true` when the search produced a result. */
  found: boolean

  /** Name of the sheet the data was found on. */
  sheetName: string

  /**
   * Address of the first cell of the found range (column letters followed by
   * the row number). Empty string when the search did not report one.
   */
  rangeStartAddress: string

  /**
   * Address of the last cell of the found range.
   * Empty string when the search did not report one.
   */
  rangeEndAddress: string

  /** Start and end address joined with a colon: `<start>:<end>`. */
  rangeAddress: string

  /** Per-sheet lists of the headers that were not found. */
  missingHeaders: MissingHeaders[]
}
/**
 * Headers that could not be found on a single sheet of the workbook.
 */
export interface MissingHeaders {
  /** Name of the sheet that was searched. */
  sheetName: string

  /** Names of the expected headers that are absent from that sheet. */
  missingHeaders: string[]
}

View File

@ -0,0 +1,13 @@
import { MissingHeaders } from "./RangeInfo"
/**
 * Outcome of searching a workbook for the expected table.
 *
 * `found` is populated when a sheet with all the required headers and at
 * least one data row was located; otherwise `missing` carries the headers
 * that could not be matched on each searched sheet.
 */
export interface SearchDataExcelResult {
  /** Headers missing per sheet, reported when no complete data was found. */
  missing?: MissingHeaders[]

  /** Details of the located table. */
  found?: {
    /** Rows as objects keyed by header name. */
    data: any
    /** Rows as arrays of worksheet cells, keeping the cell meta info. */
    arrayData: any[]
    /** Name of the sheet the table was found on. */
    sheetName: string
    /** Names of the headers that were matched on the sheet. */
    headers: string[]
    /** Address of the first cell of the found range, when known. */
    startAddress?: string
    /** Address of the last cell of the found range, when known. */
    endAddress?: string
  }
}

View File

@ -31,6 +31,8 @@ import * as iconv from 'iconv-lite'
* Which will be sent to backend
*/
import { Buffer } from 'buffer'
import { MissingHeaders } from 'src/app/models/RangeInfo'
import { SearchDataExcelResult } from 'src/app/models/SearchDataExcelResult.interface'
type AOA = any[][]
export interface ConstructorParams {
@ -61,12 +63,6 @@ export class SpreadsheetUtil {
onTableFoundEvent?: (info: string) => void
): Promise<ParseResult | undefined> {
return new Promise((resolve, reject) => {
// If file size is bigger than 2 MB we need to load its bytes in chunks
const sizeInMB = this.bytesToMB(parseParams.file.size)
if (sizeInMB > 2) {
}
let data: any[] = []
const uploader: FileUploader = parseParams.uploader || new FileUploader()
@ -153,7 +149,7 @@ export class SpreadsheetUtil {
/* save data */
let isComplete: boolean = false
let missingHeaders: string[] = []
let missingHeaders: MissingHeaders[] = []
const csvArrayHeaders: string[] = [
'_____DELETE__THIS__RECORD_____',
@ -169,83 +165,39 @@ export class SpreadsheetUtil {
{}
)
let csvArrayData: any[] = []
const rangeSheetRes: SheetInfo = this.getRangeAndSheet(
const searchResult = this.searchDataInExcel(
wb,
parseParams
)
missingHeaders = rangeSheetRes.missingHeaders
if (rangeSheetRes.foundData) {
let csvArrayData: any[] = []
if (searchResult.found) {
isComplete = true
csvArrayHeadersMap = rangeSheetRes.csvArrayHeadersMap
const ws: XLSX.WorkSheet = wb.Sheets[rangeSheetRes.sheetName]
csvArrayData = searchResult.found.arrayData
if (onParseStateChange)
onParseStateChange(
`Table found on sheet ${rangeSheetRes.sheetName} on row ${rangeSheetRes.startRow}`
)
if (!searchResult.found.headers.includes('_____delete__this__record_____')) {
csvArrayData = csvArrayData.map((row: any[]) => {
// Add empty val on start of the column to compensate for _____delete__this__record_____
// when not found in the file
row.unshift({ v: '' })
let startAddress = ''
let endAddress = ''
for (
let row = rangeSheetRes.startRow;
row < rangeSheetRes.endRow;
++row
) {
const arr: any[] = []
csvArrayHeadersLower.forEach((x) => {
const col = csvArrayHeadersMap[x]
const addr = XLSX.utils.encode_cell({
r: rangeSheetRes.rangeStartRow + row,
c: rangeSheetRes.rangeStartCol + col
})
let cell
if (!ws[addr]) {
cell = { v: '' }
} else {
cell = ws[addr]
}
if (startAddress === '' && ws[addr]) startAddress = addr
endAddress = addr
arr.push(cell)
return row
})
// If we found at least one non-empty value, it means the row is not empty;
// otherwise, it is an empty row
let arrNonEmptyValue = arr.find((x) => x.v !== '')
if (arrNonEmptyValue) csvArrayData.push(arr)
}
rangeSheetRes.rangeAddress = `${startAddress}:${endAddress}`
if (onTableFoundEvent)
onTableFoundEvent(
`Sheet: ${rangeSheetRes.sheetName}\nRange: ${rangeSheetRes.rangeAddress}`
`Sheet: ${searchResult.found.sheetName}\nRange: ${searchResult.found.startAddress}:${searchResult.found.endAddress}`
)
} else {
missingHeaders = rangeSheetRes.missingHeaders
missingHeaders = searchResult.missing || []
}
if (missingHeaders.length > 0) {
missingHeaders.sort(function compareSecondColumn(a, b) {
if (a[1] === b[1]) {
return 0
} else {
return a[1] > b[1] ? -1 : 1
}
})
let abortMsg = missingHeaders
.map((x) => x[0])
.slice(0, 5)
.join('\n')
let abortMsg = missingHeaders.map(mh => {
return `Sheet: ${mh.sheetName}.\nMissing columns: ${mh.missingHeaders.join(',')}`
}).join('\n\n')
uploader.queue.pop()
return reject(abortMsg)
@ -285,17 +237,9 @@ export class SpreadsheetUtil {
if (missingHeaders.length === 0) {
abortMsg = 'No relevant data found in File !'
} else {
missingHeaders.sort(function compareSecondColumn(a, b) {
if (a[1] === b[1]) {
return 0
} else {
return a[1] > b[1] ? -1 : 1
}
})
abortMsg = missingHeaders
.map((x) => x[0])
.slice(0, 5)
.join('\n')
abortMsg = missingHeaders.map(mh => {
return `Sheet: ${mh.sheetName}.\nMissing columns: ${mh.missingHeaders.join(',')}`
}).join('\n\n')
}
// abort message is fired, return undefined
@ -303,8 +247,18 @@ export class SpreadsheetUtil {
return reject(abortMsg)
} else {
parseParams.headerShow = csvArrayHeaders
// Remove the metadata from the cells, leave only values
csvArrayData = csvArrayData.map((row: any) =>
row.map((col: any) => (col.t === 'n' ? col.v : col.w))
row.map((col: any) => {
if (col.t === 'n') {
return col.v
} else {
if (col.w) return col.v
return typeof col.v === 'string' ? col.v.trim() : col.v
}
})
)
csvArrayData = csvArrayData.map((row: any) => {
@ -394,10 +348,26 @@ export class SpreadsheetUtil {
)
}
if (!searchResult.found) {
return reject(
`No relevant data found. 'found' object is empty, unexpected error occurred.`
)
}
const rangeStartAddress = searchResult.found.startAddress || ''
const rangeEndAddress = searchResult.found.endAddress || ''
return resolve({
uploader,
data,
rangeSheetRes,
data: csvArrayData,
rangeSheetRes: {
found: !!searchResult.found,
sheetName: searchResult.found.sheetName,
rangeStartAddress: rangeStartAddress,
rangeEndAddress: rangeEndAddress,
rangeAddress: `${rangeStartAddress}:${rangeEndAddress}`,
missingHeaders: missingHeaders,
},
headerShow: parseParams.headerShow
})
}
@ -425,7 +395,7 @@ export class SpreadsheetUtil {
uploader.queue.pop()
uploader.addToQueue([encodedFile])
resolve({
return resolve({
uploader
})
}
@ -515,24 +485,17 @@ export class SpreadsheetUtil {
})
}
/**
* Function that gives the name of the sheet which contains the data and the range of the data in that sheet. If some headers are missing, it also gives the info about those missing headers
* @param wb Excel workbook
* @returns {object: SheetInfo} an object with the necessary information about the workbook: which sheet contains the required data and what the range of that data is
*/
private getRangeAndSheet(
private searchDataInExcel(
wb: XLSX.WorkBook,
parseParams: ParseParams
): SheetInfo {
let data = []
let rangeStartRow: number = 0
let rangeStartCol: number = 0
let startRow: number = -1
let endRow: number = -1
): SearchDataExcelResult {
let headerStartAddress: string | undefined
let headerEndAddress: string | undefined
let rangeStartAddress: string | undefined
let rangeEndAddress: string | undefined
let sheetName: string = ''
let isComplete = false
let missingHeaders: string[] = []
let missing: MissingHeaders[] = []
const csvArrayHeaders: string[] = [
'_____DELETE__THIS__RECORD_____',
...parseParams.headerArray
@ -546,284 +509,208 @@ export class SpreadsheetUtil {
{}
)
wb.SheetNames.forEach((element: string) => {
// Checking for required data in each sheet in workbook/
if (isComplete) {
return
}
for (let sheet of wb.SheetNames) {
headerStartAddress = undefined
headerEndAddress = undefined
missingHeaders = []
sheetName = element
csvArrayHeadersMap = csvArrayHeadersLower.reduce(
(map: any, obj: string) => {
map[obj] = -1
return map
},
{}
)
let missingHeaders = []
sheetName = sheet
const ws: XLSX.WorkSheet = wb.Sheets[sheetName]
data = <AOA>XLSX.utils.sheet_to_json(ws, {
header: 1,
blankrows: true, // Without empty rows, if another table is below a table separated by the empty rows, startRow index is wrong
defval: ''
// Find the first header
Object.keys(ws).forEach(wsKey => {
const cellValue = ws[wsKey].v
// If the cell does not have `v` property we ignore it, those are metadata properties
if (cellValue && typeof cellValue === 'string') {
const potentialHeader = cellValue.toLowerCase()
const headerIndex = csvArrayHeadersLower.indexOf(potentialHeader)
if (headerIndex > -1) {
csvArrayHeadersMap[potentialHeader] = wsKey
if (!headerStartAddress) headerStartAddress = wsKey
// Update on every found header, until the end, which will leave
// last found header address
headerEndAddress = wsKey
}
}
})
if (data.length <= 1) {
return
// If _____delete__this__record_____ is not found in the file, remove it from the array
if (csvArrayHeadersMap['_____delete__this__record_____'] === -1) delete csvArrayHeadersMap['_____delete__this__record_____']
// Parse missing headers, if any, abort the search and jump to next sheet
missingHeaders = Object.keys(csvArrayHeadersMap).filter(header => csvArrayHeadersMap[header] === -1)
if (missingHeaders.length > 0) {
missing.push({
sheetName: sheetName,
missingHeaders: missingHeaders.map(header => header.toUpperCase())
})
continue
}
let tempArr: string[] = []
parseParams.headerArray.forEach(() => tempArr.push(''))
data.push(tempArr)
// If no headers are missing, start parsing the data column by column
const foundHeaders = Object.keys(csvArrayHeadersMap)
let foundHeaders = false
let json: any = []
let arrayData: any = []
let endRow: number
data.forEach((row: any, index: number) => {
if (isComplete) {
return
}
// Sort the headers so first headers are primary key columns
const foundHeadersSorted = foundHeaders.sort((a: string, b: string) => {
const aIsPk = parseParams.headerPks.includes(a) ? 1 : 0
const bIsPk = parseParams.headerPks.includes(b) ? 1 : 0
if (foundHeaders) {
let isDataEnd = true
let isPkNull = false
return bIsPk - aIsPk
})
csvArrayHeadersLower.forEach((x) => {
const col = csvArrayHeadersMap[x]
foundHeadersSorted.forEach(header => {
const headerAddress = csvArrayHeadersMap[header]
const headerAddressLetterRegex = headerAddress.match(/\D+/)
const headerAddressNumberRegex = headerAddress.match(/\d+/)
if (row[col] !== '' && row[col] !== undefined) {
isDataEnd = false
} else {
if (parseParams.headerPks.indexOf(x.toUpperCase()) !== -1) {
isPkNull = true
const headerAddressLetter = (headerAddressLetterRegex ? headerAddressLetterRegex[0] : -1) || -1
const headerAddressNumber = parseInt((headerAddressNumberRegex ? headerAddressNumberRegex[0] : -1) || -1)
const firstDataRow = headerAddressNumber + 1
let jsonRow = 0
// If end row found, use it as a limit
if (endRow) {
for (let row = firstDataRow; row <= endRow; row++) {
const address = `${headerAddressLetter}${row}`
const cell = ws[address]
if (parseParams.headerPks.includes(header)) {
// If this column is primary key and has less rows, set new endRow
if (cell === undefined || cell.v === undefined) {
endRow = row
break
}
}
})
if (isDataEnd || isPkNull) {
endRow = index
isComplete = true
} else {
if (startRow === -1) {
startRow = index
}
// Push to array of objects
if (!json[jsonRow]) json.push({})
if (cell) json[jsonRow][header] = typeof cell.v === 'string' ? cell.v.trim() : cell.v
// Push to array of arrays, but with all cell meta info
if (!arrayData[jsonRow]) arrayData.push([])
arrayData[jsonRow].push(cell ?? { v: '' })
jsonRow++
}
} else {
const rowLowerCase: string[] = row.map((x: any) =>
x.toString().toLowerCase()
)
// If end row not found, go through rows until empty PK row appears
let cellsRow = firstDataRow
// If in file there is no delete column, remove it from search of missing.
// This way delete column will be optional to provide in file
if (!rowLowerCase.includes('_____delete__this__record_____')) {
const deleteIndex = csvArrayHeadersLower.indexOf(
'_____delete__this__record_____'
)
while (endRow === undefined) {
const address = `${headerAddressLetter}${cellsRow}`
const cell = ws[address]
if (deleteIndex > -1) csvArrayHeadersLower.splice(deleteIndex, 1)
}
foundHeaders = true
csvArrayHeadersLower.forEach((x) => {
if (rowLowerCase.indexOf(x) === -1) {
foundHeaders = false
}
})
let result = []
result = this.findValidHeaders(
rowLowerCase,
csvArrayHeadersLower,
index,
sheetName,
parseParams
)
if (result[0] === false) {
foundHeaders = false
if (result[1].length > 0) {
result[1].forEach((data: string) => {
missingHeaders.push(data)
})
}
} else {
csvArrayHeadersMap = result[1]
}
}
})
if (isComplete) {
this.update_sheet_range(ws)
const worksheetSel = ws['!ref']
if (worksheetSel) {
const range = XLSX.utils.decode_range(ws['!ref'] || '')
rangeStartRow = range.s.r
rangeStartCol = range.s.c
}
}
})
// If start row is still -1 that means first row of found range is empty
if (startRow === -1) isComplete = false
const returnObj: SheetInfo = {
foundData: isComplete,
sheetName,
startRow,
endRow,
csvArrayHeadersMap,
missingHeaders,
rangeStartRow,
rangeStartCol
}
return returnObj
}
private findValidHeaders(
row: string[],
headers: string[],
rowNumber: number,
tabName: string,
parseParams: ParseParams
): Array<any> {
let headersFound = false
const missingErrorArray = []
let j = 0
while (j < row.length) {
if (headersFound) {
// return;
} else {
if (headers.indexOf(row[j]) !== -1) {
let breakIndex
let rowStart = 0
let rowEnd = 0
let arrStart = 0
let foundHeadersArray: string[] = []
let spaceBreak = false
for (let i = j; i < row.length; i++) {
if (
row[i] === '' ||
(foundHeadersArray.indexOf(row[i]) !== -1 &&
this.isColHeader(row[i], parseParams.headerArray))
) {
if (row[i] === '') {
spaceBreak = true
}
breakIndex = i
if (!cell || cell.v === undefined) {
// This is an empty row, row before this one is the last row with data
endRow = cellsRow - 1
break
}
// Push to array of objects
if (!json[jsonRow]) json.push({})
if (ws[address].t === 'n') {
// If type is number, use the Underlying value, otherwise use Formatted text
// https://docs.sheetjs.com/docs/csf/cell
json[jsonRow][header] = ws[address].v
} else {
foundHeadersArray.push(row[i])
if (ws[address].w) {
json[jsonRow][header] = ws[address].w
} else {
json[jsonRow][header] = typeof ws[address].v === 'string' ? ws[address].v.trim() : ws[address].v
}
}
}
let tempArray: string[] = []
// Push to array of arrays, but with all cell meta info
if (!arrayData[jsonRow]) arrayData.push([])
arrayData[jsonRow].push(cell ?? { v: '' })
if (breakIndex !== undefined) {
tempArray = row.slice(j, breakIndex)
arrStart = j
rowEnd = breakIndex
if (spaceBreak) {
rowStart = j
j = breakIndex
} else {
rowStart = j
j = breakIndex - 1
}
} else {
tempArray = row.slice(j)
rowStart = j
arrStart = j
rowEnd = row.length
j = row.length
}
let foundHeaders = true
//We check if there are missing headers
headers.forEach((x) => {
if (tempArray.indexOf(x) === -1) {
foundHeaders = false
}
})
if (foundHeaders) {
headersFound = true
let mapHeaders: any[] = headers
let csvArrayHeadersMap = mapHeaders.reduce(function (map, obj) {
map[obj] = -1
return map
}, {})
let temp = row.slice(rowStart, rowEnd)
headers.forEach((x) => {
csvArrayHeadersMap[x] = temp.indexOf(x) + rowStart
})
return [true, csvArrayHeadersMap]
} else {
let missingHeaders = getMissingHeaders(tempArray, headers)
let missingMessage = '<b>TAB(' + tabName + ')</b>'
missingErrorArray.push([
missingMessage +
' - ' +
missingHeaders[1].join(',') +
' ( missing ' +
missingHeaders[0].join(',') +
' )',
missingHeaders[1].length
])
cellsRow++
jsonRow++
}
}
if (headerStartAddress && headerEndAddress) {
const endHeaderAddressLetterRegex = headerEndAddress.match(/\D+/)
rangeStartAddress = headerStartAddress
rangeEndAddress = `${endHeaderAddressLetterRegex}${endRow}`
}
})
// Remove leftover elements with missing pk values
const rowsWithMissingPk: number[] = []
let firstRowIndexMissingPk: number | undefined
json.forEach((row: any, rowIndex: number) => {
let missingPk = false
parseParams.headerPks.forEach(pkHeader => {
if (row[pkHeader.toLowerCase()] === undefined) missingPk = true
})
if (missingPk) {
rowsWithMissingPk.push(rowIndex)
if (!firstRowIndexMissingPk) firstRowIndexMissingPk = rowIndex
}
})
// Remove the first row with a missing PK column and all rows after it, even if
// some of the later rows have a populated PK
if (firstRowIndexMissingPk) {
json.splice(firstRowIndexMissingPk, Infinity)
arrayData.splice(firstRowIndexMissingPk, Infinity)
} else {
// Fallback: Remove only rows with missing PK
rowsWithMissingPk.sort((a,b) => b - a).forEach(index => {
json.splice(index, 1)
arrayData.splice(index, 1)
})
}
if (!arrayData.length) {
return {}
}
// If we got to this point it means headers are matched
return {
found: {
data: json,
arrayData: arrayData,
sheetName: sheetName,
startAddress: rangeStartAddress,
endAddress: rangeEndAddress,
headers: foundHeaders
}
}
j++
}
return [false, missingErrorArray]
// No complete data found
return {
missing: missing
}
}
/**
 * Checks whether a column name is one of the expected headers.
 * The column name is upper-cased before the lookup; the header list is
 * compared as given.
 * @param col column name to look up
 * @param headerArray list of expected header names
 * @returns `true` when the upper-cased column name is present in `headerArray`
 */
private isColHeader(col: string, headerArray: string[]) {
  const upperCasedCol = col.toUpperCase()

  return headerArray.includes(upperCasedCol)
}
/**
 * Function that updates the `!ref` range value of a worksheet so that it
 * spans every cell present in it (adapted from the official docs).
 *
 * A worksheet without any cell is left untouched: there is nothing to
 * measure, and the initial bounds (`Infinity`) do not form a valid range.
 * @param ws worksheet to be updated
 */
private update_sheet_range(ws: XLSX.WorkSheet) {
  // Keys starting with '!' (such as `!ref`) are skipped, the remaining keys
  // are decoded as cell addresses
  const cellAddresses = Object.keys(ws).filter((key) => key.charAt(0) !== '!')

  // No cells: keep the existing `!ref` instead of encoding an Infinity range
  if (cellAddresses.length === 0) return

  const range = { s: { r: Infinity, c: Infinity }, e: { r: 0, c: 0 } }

  for (const address of cellAddresses) {
    const cell = XLSX.utils.decode_cell(address)

    // Grow the bounding box so that it includes this cell
    range.s.c = Math.min(range.s.c, cell.c)
    range.s.r = Math.min(range.s.r, cell.r)
    range.e.c = Math.max(range.e.c, cell.c)
    range.e.r = Math.max(range.e.r, cell.r)
  }

  ws['!ref'] = XLSX.utils.encode_range(range)
}
/**
* When excel is password protected we will display the password prompt for user to type password in.
* @returns Password user input or undefined if discarded by user
*/
// private promptExcelPassword(): Promise<string | undefined> {
// return new Promise((resolve, reject) => {
// this.excelPasswordModalService.open().subscribe((result: Result) => {
// resolve(result.password)
// })
// })
// }
private updateDateTimeCols(
headers: any,
data: any,