feat: searching data in excel files using new algorithm (massive performance improvement)

2024-08-07 17:53:52 +02:00
parent 403d08c86a
commit bbb725c64c
5 changed files with 262 additions and 354 deletions
--- a/client/cypress/e2e/excel.cy.ts
+++ b/client/cypress/e2e/excel.cy.ts
@@ -109,13 +109,8 @@ context('excel tests: ', function () {
    openTableFromTree(libraryToOpenIncludes, 'mpe_x_test')

    attachExcelFile('duplicate_column_excel.xlsx', () => {
-      cy.get('.abortMsg', { timeout: longerCommandTimeout })
-        .should('exist')
-        .then((elements: any) => {
-          if (elements[0]) {
-            if (elements[0].innerText.toLowerCase().includes('missing')) done()
-          }
-        })
+      submitExcel()
+      rejectExcel(done)
    })
  })

--- a/client/src/app/models/ParseResult.interface.ts
+++ b/client/src/app/models/ParseResult.interface.ts
@@ -1,5 +1,5 @@
 import { FileUploader } from './FileUploader.class'
-import SheetInfo from './SheetInfo'
+import FoundRangeInfo from './RangeInfo'

 export interface ParseResult {
  /**
@@ -10,6 +10,6 @@ export interface ParseResult {
   * In case of CSV file, won't be returned
   */
  headerShow?: string[]
-  rangeSheetRes?: SheetInfo
+  rangeSheetRes?: FoundRangeInfo
  uploader: FileUploader
 }
--- a/client/src/app/models/RangeInfo.ts
+++ b/client/src/app/models/RangeInfo.ts
@@ -0,0 +1,13 @@
+export default interface FoundRangeInfo {
+  found: boolean
+  sheetName: string
+  rangeStartAddress: string
+  rangeEndAddress: string
+  rangeAddress: string
+  missingHeaders: MissingHeaders[]
+}
+
+export interface MissingHeaders {
+  sheetName: string
+  missingHeaders: string[]
+}
--- a/client/src/app/models/SearchDataExcelResult.interface.ts
+++ b/client/src/app/models/SearchDataExcelResult.interface.ts
@@ -0,0 +1,13 @@
+import { MissingHeaders } from "./RangeInfo"
+
+export interface SearchDataExcelResult {
+  missing?: MissingHeaders[],
+  found?: {
+    data: any,
+    arrayData: any[],
+    sheetName: string
+    headers: string[]
+    startAddress?: string
+    endAddress?: string
+  }
+}
--- a/client/src/app/shared/spreadsheet-util/spreadsheet-util.ts
+++ b/client/src/app/shared/spreadsheet-util/spreadsheet-util.ts
@@ -31,6 +31,8 @@ import * as iconv from 'iconv-lite'
 * Which will be send to backend
 */
 import { Buffer } from 'buffer'
+import { MissingHeaders } from 'src/app/models/RangeInfo'
+import { SearchDataExcelResult } from 'src/app/models/SearchDataExcelResult.interface'
 type AOA = any[][]

 export interface ConstructorParams {
@@ -61,12 +63,6 @@ export class SpreadsheetUtil {
    onTableFoundEvent?: (info: string) => void
  ): Promise<ParseResult | undefined> {
    return new Promise((resolve, reject) => {
-      // If file size is bigger then 2 MB we need to load its bytes in chunks
-      const sizeInMB = this.bytesToMB(parseParams.file.size)
-
-      if (sizeInMB > 2) {
-      }
-
      let data: any[] = []
      const uploader: FileUploader = parseParams.uploader || new FileUploader()

@@ -153,7 +149,7 @@ export class SpreadsheetUtil {

          /* save data */
          let isComplete: boolean = false
-          let missingHeaders: string[] = []
+          let missingHeaders: MissingHeaders[] = []

          const csvArrayHeaders: string[] = [
            '_____DELETE__THIS__RECORD_____',
@@ -169,83 +165,39 @@ export class SpreadsheetUtil {
            {}
          )

-          let csvArrayData: any[] = []
-          const rangeSheetRes: SheetInfo = this.getRangeAndSheet(
+          const searchResult = this.searchDataInExcel(
            wb,
            parseParams
          )
-          missingHeaders = rangeSheetRes.missingHeaders

-          if (rangeSheetRes.foundData) {
+          let csvArrayData: any[] = []
+
+          if (searchResult.found) {
            isComplete = true
-            csvArrayHeadersMap = rangeSheetRes.csvArrayHeadersMap
-            const ws: XLSX.WorkSheet = wb.Sheets[rangeSheetRes.sheetName]
+            csvArrayData = searchResult.found.arrayData

-            if (onParseStateChange)
-              onParseStateChange(
-                `Table found on sheet ${rangeSheetRes.sheetName} on row ${rangeSheetRes.startRow}`
-              )
+            if (!searchResult.found.headers.includes('_____delete__this__record_____')) {
+              csvArrayData = csvArrayData.map((row: any[]) => {
+                // Prepend an empty value to each row to compensate for the
+                // _____delete__this__record_____ column when it is not found in the file
+                row.unshift({ v: '' })

-            let startAddress = ''
-            let endAddress = ''
-
-            for (
-              let row = rangeSheetRes.startRow;
-              row < rangeSheetRes.endRow;
-              ++row
-            ) {
-              const arr: any[] = []
-
-              csvArrayHeadersLower.forEach((x) => {
-                const col = csvArrayHeadersMap[x]
-                const addr = XLSX.utils.encode_cell({
-                  r: rangeSheetRes.rangeStartRow + row,
-                  c: rangeSheetRes.rangeStartCol + col
-                })
-
-                let cell
-
-                if (!ws[addr]) {
-                  cell = { v: '' }
-                } else {
-                  cell = ws[addr]
-                }
-
-                if (startAddress === '' && ws[addr]) startAddress = addr
-                endAddress = addr
-
-                arr.push(cell)
+                return row
              })
-
-              // If we found at least one non empty value it means it is not empty row
-              // othervise, it is empty row
-              let arrNonEmptyValue = arr.find((x) => x.v !== '')
-
-              if (arrNonEmptyValue) csvArrayData.push(arr)
            }

-            rangeSheetRes.rangeAddress = `${startAddress}:${endAddress}`
-
            if (onTableFoundEvent)
              onTableFoundEvent(
-                `Sheet: ${rangeSheetRes.sheetName}\nRange: ${rangeSheetRes.rangeAddress}`
+                `Sheet: ${searchResult.found.sheetName}\nRange: ${searchResult.found.startAddress}:${searchResult.found.endAddress}`
              )
          } else {
-            missingHeaders = rangeSheetRes.missingHeaders
+            missingHeaders = searchResult.missing || []
          }

          if (missingHeaders.length > 0) {
-            missingHeaders.sort(function compareSecondColumn(a, b) {
-              if (a[1] === b[1]) {
-                return 0
-              } else {
-                return a[1] > b[1] ? -1 : 1
-              }
-            })
-            let abortMsg = missingHeaders
-              .map((x) => x[0])
-              .slice(0, 5)
-              .join('\n')
+            let abortMsg = missingHeaders.map(mh => {
+              return `Sheet: ${mh.sheetName}.\nMissing columns: ${mh.missingHeaders.join(',')}`
+            }).join('\n\n')

            uploader.queue.pop()
            return reject(abortMsg)
@@ -285,17 +237,9 @@ export class SpreadsheetUtil {
            if (missingHeaders.length === 0) {
              abortMsg = 'No relevant data found in File !'
            } else {
-              missingHeaders.sort(function compareSecondColumn(a, b) {
-                if (a[1] === b[1]) {
-                  return 0
-                } else {
-                  return a[1] > b[1] ? -1 : 1
-                }
-              })
-              abortMsg = missingHeaders
-                .map((x) => x[0])
-                .slice(0, 5)
-                .join('\n')
+              abortMsg = missingHeaders.map(mh => {
+                return `Sheet: ${mh.sheetName}.\nMissing columns: ${mh.missingHeaders.join(',')}`
+              }).join('\n\n')
            }

            // abort message is fired, return undefined
@@ -303,8 +247,18 @@ export class SpreadsheetUtil {
            return reject(abortMsg)
          } else {
            parseParams.headerShow = csvArrayHeaders
+
+            // Remove the metadata from the cells, leave only values
            csvArrayData = csvArrayData.map((row: any) =>
-              row.map((col: any) => (col.t === 'n' ? col.v : col.w))
+              row.map((col: any) => {
+                if (col.t === 'n') {
+                  return col.v
+                } else {
+                  if (col.w) return col.v
+
+                  return typeof col.v === 'string' ? col.v.trim() : col.v
+                }
+              })
            )

            csvArrayData = csvArrayData.map((row: any) => {
@@ -394,10 +348,26 @@ export class SpreadsheetUtil {
            )
          }

+          if (!searchResult.found) {
+            return reject(
+              `No relevant data found. 'found' object is empty, unexpected error occurred.`
+            )
+          }
+
+          const rangeStartAddress = searchResult.found.startAddress || ''
+          const rangeEndAddress = searchResult.found.endAddress || ''
+
          return resolve({
            uploader,
-            data,
-            rangeSheetRes,
+            data: csvArrayData,
+            rangeSheetRes: {
+              found: !!searchResult.found,
+              sheetName: searchResult.found.sheetName,
+              rangeStartAddress: rangeStartAddress,
+              rangeEndAddress: rangeEndAddress,
+              rangeAddress: `${rangeStartAddress}:${rangeEndAddress}`,
+              missingHeaders: missingHeaders,
+            },
            headerShow: parseParams.headerShow
          })
        }
@@ -425,7 +395,7 @@ export class SpreadsheetUtil {
            uploader.queue.pop()
            uploader.addToQueue([encodedFile])

-            resolve({
+            return resolve({
              uploader
            })
          }
@@ -515,24 +485,17 @@ export class SpreadsheetUtil {
    })
  }

-  /**
-   * Function that gives the sheet name which contains data and range of data in that sheet, if some headers are missing then also gives the info about those missing headers
-   * @param wb Excel workbook
-   * @returns {object: SheetInfo} an object which contains necessary information about workbook that which sheet contains required data and what's the range
-   */
-  private getRangeAndSheet(
+  private searchDataInExcel(
    wb: XLSX.WorkBook,
    parseParams: ParseParams
-  ): SheetInfo {
-    let data = []
-
-    let rangeStartRow: number = 0
-    let rangeStartCol: number = 0
-    let startRow: number = -1
-    let endRow: number = -1
+  ): SearchDataExcelResult {
+    let headerStartAddress: string | undefined
+    let headerEndAddress: string | undefined
+    let rangeStartAddress: string | undefined
+    let rangeEndAddress: string | undefined
    let sheetName: string = ''
-    let isComplete = false
-    let missingHeaders: string[] = []
+    let missing: MissingHeaders[] = []
+
    const csvArrayHeaders: string[] = [
      '_____DELETE__THIS__RECORD_____',
      ...parseParams.headerArray
@@ -546,284 +509,208 @@ export class SpreadsheetUtil {
      {}
    )

-    wb.SheetNames.forEach((element: string) => {
-      // Checking for required data in each sheet in workbook/
-      if (isComplete) {
-        return
-      }
+    for (let sheet of wb.SheetNames) {
+      headerStartAddress = undefined
+      headerEndAddress = undefined

-      missingHeaders = []
-      sheetName = element
+      csvArrayHeadersMap = csvArrayHeadersLower.reduce(
+        (map: any, obj: string) => {
+          map[obj] = -1
+          return map
+        },
+        {}
+      )
+
+      let missingHeaders = []
+      sheetName = sheet
      const ws: XLSX.WorkSheet = wb.Sheets[sheetName]

-      data = <AOA>XLSX.utils.sheet_to_json(ws, {
-        header: 1,
-        blankrows: true, // Without empty rows, if another table is below a table separated by the empty rows, startRow index is wrong
-        defval: ''
+      // Scan every cell and record the address of each expected header
+      Object.keys(ws).forEach(wsKey => {
+        const cellValue = ws[wsKey].v
+
+        // If the cell does not have `v` property we ignore it, those are metadata properties
+        if (cellValue && typeof cellValue === 'string') {
+          const potentialHeader = cellValue.toLowerCase()
+          const headerIndex = csvArrayHeadersLower.indexOf(potentialHeader)
+
+          if (headerIndex > -1) {
+            csvArrayHeadersMap[potentialHeader] = wsKey
+
+            if (!headerStartAddress) headerStartAddress = wsKey
+
+            // Overwrite on every match so that, once the scan finishes, this
+            // holds the address of the last header found
+            headerEndAddress = wsKey
+          }
+        }
      })

-      if (data.length <= 1) {
-        return
+      // If _____delete__this__record_____ is not found in this sheet, remove it from the header map
+      if (csvArrayHeadersMap['_____delete__this__record_____'] === -1) delete csvArrayHeadersMap['_____delete__this__record_____']
+
+      // Parse missing headers, if any, abort the search and jump to next sheet
+      missingHeaders = Object.keys(csvArrayHeadersMap).filter(header => csvArrayHeadersMap[header] === -1)
+
+      if (missingHeaders.length > 0) {
+        missing.push({
+          sheetName: sheetName,
+          missingHeaders: missingHeaders.map(header => header.toUpperCase())
+        })
+
+        continue
      }

-      let tempArr: string[] = []
-      parseParams.headerArray.forEach(() => tempArr.push(''))
-      data.push(tempArr)
+      // If no headers are missing, start parsing the data column by column
+      const foundHeaders = Object.keys(csvArrayHeadersMap)

-      let foundHeaders = false
+      let json: any = []
+      let arrayData: any = []
+      let endRow: number

-      data.forEach((row: any, index: number) => {
-        if (isComplete) {
-          return
-        }
+      // Sort the headers so first headers are primary key columns
+      const foundHeadersSorted = foundHeaders.sort((a: string, b: string) => {
+        const aIsPk = parseParams.headerPks.includes(a) ? 1 : 0
+        const bIsPk = parseParams.headerPks.includes(b) ? 1 : 0

-        if (foundHeaders) {
-          let isDataEnd = true
-          let isPkNull = false
+        return bIsPk - aIsPk
+      })

-          csvArrayHeadersLower.forEach((x) => {
-            const col = csvArrayHeadersMap[x]
+      foundHeadersSorted.forEach(header => {
+        const headerAddress = csvArrayHeadersMap[header]
+        const headerAddressLetterRegex = headerAddress.match(/\D+/)
+        const headerAddressNumberRegex = headerAddress.match(/\d+/)

-            if (row[col] !== '' && row[col] !== undefined) {
-              isDataEnd = false
-            } else {
-              if (parseParams.headerPks.indexOf(x.toUpperCase()) !== -1) {
-                isPkNull = true
+        const headerAddressLetter = (headerAddressLetterRegex ? headerAddressLetterRegex[0] : -1) || -1
+        const headerAddressNumber = parseInt((headerAddressNumberRegex ? headerAddressNumberRegex[0] : -1) || -1)
+
+        const firstDataRow = headerAddressNumber + 1
+
+        let jsonRow = 0
+
+        // If end row found, use it as a limit
+        if (endRow) {
+          for (let row = firstDataRow; row <= endRow; row++) {
+            const address = `${headerAddressLetter}${row}`
+            const cell = ws[address]
+
+            if (parseParams.headerPks.includes(header)) {
+              // If this column is primary key and has less rows, set new endRow
+              if (cell === undefined || cell.v === undefined) {
+                endRow = row
+                break
              }
            }
-          })

-          if (isDataEnd || isPkNull) {
-            endRow = index
-            isComplete = true
-          } else {
-            if (startRow === -1) {
-              startRow = index
-            }
+            // Push to array of objects
+            if (!json[jsonRow]) json.push({})
+            if (cell) json[jsonRow][header] = typeof cell.v === 'string' ? cell.v.trim() : cell.v
+
+            // Push to array of arrays, but with all cell meta info
+            if (!arrayData[jsonRow]) arrayData.push([])
+            arrayData[jsonRow].push(cell ?? { v: '' })
+
+            jsonRow++
          }
        } else {
-          const rowLowerCase: string[] = row.map((x: any) =>
-            x.toString().toLowerCase()
-          )
+          // If end row not found, go through rows until empty PK row appears
+          let cellsRow = firstDataRow

-          // If in file there is no delete column, remove it from search of missing.
-          // This way delete column will be optional to provide in file
-          if (!rowLowerCase.includes('_____delete__this__record_____')) {
-            const deleteIndex = csvArrayHeadersLower.indexOf(
-              '_____delete__this__record_____'
-            )
+          while (endRow === undefined) {
+            const address = `${headerAddressLetter}${cellsRow}`
+            const cell = ws[address]

-            if (deleteIndex > -1) csvArrayHeadersLower.splice(deleteIndex, 1)
-          }
-
-          foundHeaders = true
-
-          csvArrayHeadersLower.forEach((x) => {
-            if (rowLowerCase.indexOf(x) === -1) {
-              foundHeaders = false
-            }
-          })
-
-          let result = []
-
-          result = this.findValidHeaders(
-            rowLowerCase,
-            csvArrayHeadersLower,
-            index,
-            sheetName,
-            parseParams
-          )
-
-          if (result[0] === false) {
-            foundHeaders = false
-
-            if (result[1].length > 0) {
-              result[1].forEach((data: string) => {
-                missingHeaders.push(data)
-              })
-            }
-          } else {
-            csvArrayHeadersMap = result[1]
-          }
-        }
-      })
-
-      if (isComplete) {
-        this.update_sheet_range(ws)
-        const worksheetSel = ws['!ref']
-
-        if (worksheetSel) {
-          const range = XLSX.utils.decode_range(ws['!ref'] || '')
-          rangeStartRow = range.s.r
-          rangeStartCol = range.s.c
-        }
-      }
-    })
-
-    // If start row is still -1 that means first row of found range is empty
-    if (startRow === -1) isComplete = false
-
-    const returnObj: SheetInfo = {
-      foundData: isComplete,
-      sheetName,
-      startRow,
-      endRow,
-      csvArrayHeadersMap,
-      missingHeaders,
-      rangeStartRow,
-      rangeStartCol
-    }
-
-    return returnObj
-  }
-
-  private findValidHeaders(
-    row: string[],
-    headers: string[],
-    rowNumber: number,
-    tabName: string,
-    parseParams: ParseParams
-  ): Array<any> {
-    let headersFound = false
-    const missingErrorArray = []
-    let j = 0
-
-    while (j < row.length) {
-      if (headersFound) {
-        // return;
-      } else {
-        if (headers.indexOf(row[j]) !== -1) {
-          let breakIndex
-          let rowStart = 0
-          let rowEnd = 0
-          let arrStart = 0
-          let foundHeadersArray: string[] = []
-          let spaceBreak = false
-
-          for (let i = j; i < row.length; i++) {
-            if (
-              row[i] === '' ||
-              (foundHeadersArray.indexOf(row[i]) !== -1 &&
-                this.isColHeader(row[i], parseParams.headerArray))
-            ) {
-              if (row[i] === '') {
-                spaceBreak = true
-              }
-
-              breakIndex = i
+            if (!cell || cell.v === undefined) {
+              // This is an empty row, row before this one is the last row with data
+              endRow = cellsRow - 1
              break
+            }
+
+            // Push to array of objects
+            if (!json[jsonRow]) json.push({})
+            if (ws[address].t === 'n') {
+              // If type is number, use the Underlying value, otherwise use Formatted text
+              // https://docs.sheetjs.com/docs/csf/cell
+              json[jsonRow][header] = ws[address].v
            } else {
-              foundHeadersArray.push(row[i])
+              if (ws[address].w) {
+                json[jsonRow][header] = ws[address].w
+              } else {
+                json[jsonRow][header] = typeof ws[address].v === 'string' ? ws[address].v.trim() : ws[address].v
+              }
            }
-          }

-          let tempArray: string[] = []
+            // Push to array of arrays, but with all cell meta info
+            if (!arrayData[jsonRow]) arrayData.push([])
+            arrayData[jsonRow].push(cell ?? { v: '' })

-          if (breakIndex !== undefined) {
-            tempArray = row.slice(j, breakIndex)
-            arrStart = j
-            rowEnd = breakIndex
-
-            if (spaceBreak) {
-              rowStart = j
-              j = breakIndex
-            } else {
-              rowStart = j
-              j = breakIndex - 1
-            }
-          } else {
-            tempArray = row.slice(j)
-            rowStart = j
-            arrStart = j
-            rowEnd = row.length
-            j = row.length
-          }
-
-          let foundHeaders = true
-
-          //We check if there are missing headers
-          headers.forEach((x) => {
-            if (tempArray.indexOf(x) === -1) {
-              foundHeaders = false
-            }
-          })
-
-          if (foundHeaders) {
-            headersFound = true
-
-            let mapHeaders: any[] = headers
-
-            let csvArrayHeadersMap = mapHeaders.reduce(function (map, obj) {
-              map[obj] = -1
-              return map
-            }, {})
-
-            let temp = row.slice(rowStart, rowEnd)
-
-            headers.forEach((x) => {
-              csvArrayHeadersMap[x] = temp.indexOf(x) + rowStart
-            })
-
-            return [true, csvArrayHeadersMap]
-          } else {
-            let missingHeaders = getMissingHeaders(tempArray, headers)
-
-            let missingMessage = '<b>TAB(' + tabName + ')</b>'
-            missingErrorArray.push([
-              missingMessage +
-                ' - ' +
-                missingHeaders[1].join(',') +
-                ' ( missing ' +
-                missingHeaders[0].join(',') +
-                ' )',
-              missingHeaders[1].length
-            ])
+            cellsRow++
+            jsonRow++
          }
        }
+
+        if (headerStartAddress && headerEndAddress) {
+          const endHeaderAddressLetterRegex = headerEndAddress.match(/\D+/)
+
+          rangeStartAddress = headerStartAddress
+          rangeEndAddress = `${endHeaderAddressLetterRegex}${endRow}`
+        }
+      })
+
+      // Remove leftover elements with missing pk values
+      const rowsWithMissingPk: number[] = []
+      let firstRowIndexMissingPk: number | undefined
+
+      json.forEach((row: any, rowIndex: number) => {
+        let missingPk = false
+
+        parseParams.headerPks.forEach(pkHeader => {
+          if (row[pkHeader.toLowerCase()] === undefined) missingPk = true
+        })
+
+        if (missingPk) {
+          rowsWithMissingPk.push(rowIndex)
+
+          if (!firstRowIndexMissingPk) firstRowIndexMissingPk = rowIndex
+        }
+      })
+
+      // Remove the first row with a missing PK and every row after it, even if
+      // some of the later rows have a populated PK
+      if (firstRowIndexMissingPk) {
+        json.splice(firstRowIndexMissingPk, Infinity)
+        arrayData.splice(firstRowIndexMissingPk, Infinity)
+      } else {
+        // Fallback: Remove only rows with missing PK
+        rowsWithMissingPk.sort((a,b) => b - a).forEach(index => {
+          json.splice(index, 1)
+          arrayData.splice(index, 1)
+        })
+      }
+
+      if (!arrayData.length) {
+        return {}
+      }
+
+      // If we got to this point it means headers are matched
+      return {
+        found: {
+          data: json,
+          arrayData: arrayData,
+          sheetName: sheetName,
+          startAddress: rangeStartAddress,
+          endAddress: rangeEndAddress,
+          headers: foundHeaders
+        }
      }
-      j++
    }
-    return [false, missingErrorArray]
+
+    // No complete data found
+    return {
+      missing: missing
+    }
  }

-  private isColHeader(col: string, headerArray: string[]) {
-    return headerArray.indexOf(col.toUpperCase()) > -1
-  }
-
-  /**
-   * Function that updates the !ref range value provided in official docs.
-   * @param ws worksheet to be updated
-   */
-  private update_sheet_range(ws: XLSX.WorkSheet) {
-    const range = { s: { r: Infinity, c: Infinity }, e: { r: 0, c: 0 } }
-
-    Object.keys(ws)
-      .filter(function (x) {
-        return x.charAt(0) != '!'
-      })
-      .map(XLSX.utils.decode_cell)
-      .forEach(function (x: any) {
-        range.s.c = Math.min(range.s.c, x.c)
-        range.s.r = Math.min(range.s.r, x.r)
-        range.e.c = Math.max(range.e.c, x.c)
-        range.e.r = Math.max(range.e.r, x.r)
-      })
-
-    ws['!ref'] = XLSX.utils.encode_range(range)
-  }
-
-  /**
-   * When excel is password protected we will display the password prompt for user to type password in.
-   * @returns Password user input or undefined if discarded by user
-   */
-  // private promptExcelPassword(): Promise<string | undefined> {
-  //   return new Promise((resolve, reject) => {
-  //     this.excelPasswordModalService.open().subscribe((result: Result) => {
-  //       resolve(result.password)
-  //     })
-  //   })
-  // }
-
  private updateDateTimeCols(
    headers: any,
    data: any,