dc/client/src/app/shared/spreadsheet-util/spreadsheet-util.ts
Mihajlo Medjedovic 6547461637
Some checks failed
Build / Build-and-ng-test (pull_request) Failing after 44s
fix(multi load): xlsx read file ahead of time, while user choose datasets
2024-08-09 16:09:53 +02:00

918 lines
28 KiB
TypeScript

import { isSpecialMissing } from '@sasjs/utils/input/validators'
import {
dateFormat,
dateToUtcTime,
dateToTime
} from 'src/app/editor/utils/date.utils'
import {
getMissingHeaders,
excelDateToJSDate
} from 'src/app/editor/utils/grid.utils'
import {
isStringNumber,
isStringDecimal
} from 'src/app/editor/utils/types.utils'
import { FileUploader } from 'src/app/models/FileUploader.class'
import SheetInfo from 'src/app/models/SheetInfo'
import { blobToFile } from 'src/app/xlmap/utils/file.utils'
import * as XLSX from '@sheet/crypto'
import { LicenceState } from 'src/app/models/LicenceState'
import { BehaviorSubject } from 'rxjs'
import { ParseParams } from 'src/app/models/ParseParams.interface'
import { ParseResult } from 'src/app/models/ParseResult.interface'
import { OpenOptions } from '../excel-password-modal/models/options.interface'
/**
* Used in combination with buffer
*/
import * as iconv from 'iconv-lite'
/**
* Used together with `iconv` to encode the JSON data that SheetJS captured from the excel file
* back into a file, which will then be sent to the backend
*/
import { Buffer } from 'buffer'
import { MissingHeaders } from 'src/app/models/RangeInfo'
import { SearchDataExcelResult } from 'src/app/models/SearchDataExcelResult.interface'
// Array-of-arrays alias (rows of cell values); not referenced in the visible part of this file
type AOA = any[][]
/**
 * Dependencies handed to the `SpreadsheetUtil` constructor
 */
export interface ConstructorParams {
// Licence state stream; `SpreadsheetUtil` reads `value.submit_rows_limit` from it
licenceState: BehaviorSubject<LicenceState>
}
export class SpreadsheetUtil {
  private licenceState: BehaviorSubject<LicenceState>

  /**
   * @param params dependencies of the util, currently only the licence state
   */
  constructor(params: ConstructorParams) {
    this.licenceState = params.licenceState
  }

  /**
   * Parses attached file and searches for the matching data
   *
   * If CSV is provided no searching of the data will be executed, but csv file
   * is returned back in the FileUploader queue
   *
   * @param parseParams params required for parsing the file. `encoding` is
   * defaulted to `UTF-8` on this object when it is not set
   * @param promptExcelPassword used to trigger the modal for password input
   * when provided file is locked
   * @param onParseStateChange callback used to inform about parsing state
   * so the user of the function can update the UI with latest info
   * @param onTableFoundEvent callback fired when table range is found in the file
   *
   * @returns parsed list of files to upload and JSON data ready for HOT usage.
   * The promise rejects (mostly with a message string) when the file can not
   * be read, has an unsupported type or holds no matching data
   */
  public parseSpreadsheetFile(
    parseParams: ParseParams,
    promptExcelPassword: (options?: OpenOptions) => Promise<string | undefined>,
    onParseStateChange?: (uploadState: string) => void,
    onTableFoundEvent?: (info: string) => void
  ): Promise<ParseResult | undefined> {
    return new Promise((resolve, reject) => {
      const uploader: FileUploader = parseParams.uploader || new FileUploader()

      // If workbook is present it means file is already read and we don't need
      // to read it again, otherwise we will do a XLSX.read()
      if (parseParams.workbook) {
        this.parseExcelFile(
          parseParams,
          parseParams.workbook,
          uploader,
          onTableFoundEvent
        ).then(resolve, reject)
        return
      }

      // File is not read so we must do a XLSX.read()
      const file: File = parseParams.file
      if (!parseParams.encoding) parseParams.encoding = 'UTF-8'

      if (onParseStateChange)
        onParseStateChange(`Loading ${file.name} into the browser`)

      // Only the first 3 characters of the extension are used, so that
      // `xls`, `xlsx`, `xlsm`... are all handled as excel files
      const extensionStart = file.name.lastIndexOf('.') + 1
      const fileType = file.name.slice(extensionStart, extensionStart + 3)
      const normalizedType = fileType.toLowerCase()

      if (normalizedType === 'xls') {
        const reader: FileReader = new FileReader()

        reader.onload = async (fileReaderResponse: any) => {
          // Everything is wrapped in try/catch, otherwise a rejected read
          // would be an unhandled rejection and the outer promise would hang
          try {
            const wb = await this.xslxStartReading(
              fileReaderResponse,
              promptExcelPassword,
              parseParams.password
            )

            if (!wb) {
              reject('No workbook found.')
              return
            }

            resolve(
              await this.parseExcelFile(
                parseParams,
                wb,
                uploader,
                onTableFoundEvent
              )
            )
          } catch (err) {
            reject(err)
          }
        }
        reader.onerror = () => reject('Error reading the file')

        reader.readAsArrayBuffer(file)
        return
      }

      if (normalizedType === 'csv') {
        // Return value of the promise executor is ignored, so the result must
        // be forwarded explicitly
        this.parseCsvFile(parseParams, uploader, fileType).then(
          (response) => resolve(response as ParseResult),
          reject
        )
        return
      }

      const abortMsg =
        'Invalid file type "<b>' +
        parseParams.file.name +
        '</b>". Please upload csv or excel file.'
      uploader.queue.pop()
      reject(abortMsg)
    })
  }

  /**
   * Searches the workbook for the table described by `parseParams`, converts
   * date/time and formula columns, and adds a clean CSV file (limited by the
   * licence rows limit) to the uploader queue.
   *
   * Side effects: sets `parseParams.headerShow`, mutates the found cells and
   * pops the last uploader queue entry on every failure.
   *
   * @param parseParams params required for parsing the file
   * @param workbook already read workbook
   * @param uploader uploader which receives the generated CSV file
   * @param onTableFoundEvent callback fired when table range is found
   *
   * @returns uploader, extracted rows (not limited by licence), range info
   * and headers. Rejects with a message string when headers are missing or
   * no data is found
   */
  private parseExcelFile(
    parseParams: ParseParams,
    workbook: XLSX.WorkBook,
    uploader: FileUploader,
    onTableFoundEvent?: (info: string) => void
  ): Promise<ParseResult | undefined> {
    return new Promise((resolve, reject) => {
      // Abort message is fired, remove the pending file from the queue
      const abort = (message: string) => {
        uploader.queue.pop()
        reject(message)
      }

      const csvArrayHeaders: string[] = [
        '_____DELETE__THIS__RECORD_____',
        ...parseParams.headerArray
      ]

      const searchResult = this.searchDataInExcel(workbook, parseParams)
      const found = searchResult.found

      if (!found) {
        const missingHeaders: MissingHeaders[] = searchResult.missing || []

        if (missingHeaders.length > 0) {
          return abort(
            missingHeaders
              .map((mh) => {
                return `Sheet: ${mh.sheetName}.\nMissing columns: ${mh.missingHeaders.join(',')}`
              })
              .join('\n\n')
          )
        }

        return abort('No relevant data found in File !')
      }

      let csvArrayData: any[] = found.arrayData

      if (!found.headers.includes('_____delete__this__record_____')) {
        csvArrayData = csvArrayData.map((row: any[]) => {
          // Add empty val on start of the row to compensate for
          // _____delete__this__record_____ when not found in the file
          row.unshift({ v: '' })
          return row
        })
      }

      if (onTableFoundEvent)
        onTableFoundEvent(
          `Sheet: ${found.sheetName}\nRange: ${found.startAddress}:${found.endAddress}`
        )

      // If first row is empty, that means no data has been found
      if (csvArrayData.length === 0 || csvArrayData[0].length === 0) {
        return abort('No relevant data found in File !')
      }

      if (
        parseParams.dateTimeHeaders.length > 0 ||
        parseParams.dateHeaders.length > 0 ||
        parseParams.timeHeaders.length > 0
      ) {
        csvArrayData = this.updateDateTimeCols(
          csvArrayHeaders,
          csvArrayData,
          parseParams
        )
      }

      if (parseParams.xlRules.length > 0) {
        csvArrayData = this.updateXLRuleCols(
          csvArrayHeaders,
          csvArrayData,
          parseParams
        )
      }

      parseParams.headerShow = csvArrayHeaders

      // Remove the metadata from the cells, leave only values
      csvArrayData = csvArrayData.map((row: any) =>
        row.map((cell: any, index: number) => {
          // NOTE(review): when the formatted text `w` exists the raw value `v`
          // is still the one returned (only without trimming). The FORMULA
          // rule stores formulas of non numeric cells in `w` - confirm this
          // should not return `w`.
          let value =
            cell.t === 'n' || cell.w
              ? cell.v
              : typeof cell.v === 'string'
                ? cell.v.trim()
                : cell.v

          if (!value && value !== 0) value = ''

          const colName = parseParams.headerShow[index]
          const colRule = parseParams.dcValidator?.getRule(colName)

          if (colRule?.type === 'numeric') {
            if (isSpecialMissing(value) && !value.includes('.'))
              value = '.' + value
          }

          return value
        })
      )

      // Apply licence rows limitation if exists, it is only affecting data
      // which will be sent to SAS
      const strippedCsvArrayData = csvArrayData.slice(
        0,
        this.licenceState.value.submit_rows_limit
      )

      // To submit to sas service, we need clean version of CSV of file
      // attached. XLSX will do the parsing and heavy lifting
      // First we create worksheet of json (data we extracted)
      const ws = XLSX.utils.json_to_sheet(strippedCsvArrayData, {
        skipHeader: true
      })

      // Create CSV to be uploaded from worksheet and prepend headers
      const csvContentClean =
        csvArrayHeaders.join(',') + '\n' + XLSX.utils.sheet_to_csv(ws)

      // Blob from which CSV file will be created depending of the selected
      // encoding
      let blob: Blob

      if (parseParams.encoding === 'WLATIN1') {
        // NOTE(review): this decodes the UTF-8 bytes of the content as
        // CP-1252 and the Blob then stores the result as UTF-8. If the goal
        // is a CP-1252 encoded file, `iconv.encode(csvContentClean, 'CP-1252')`
        // would be expected here - confirm against the backend before changing.
        const encoded = iconv.decode(Buffer.from(csvContentClean), 'CP-1252')
        blob = new Blob([encoded], { type: 'application/csv' })
      } else {
        // UTF-8
        blob = new Blob([csvContentClean], { type: 'application/csv' })
      }

      const newCSVFile: File = blobToFile(blob, parseParams.file.name + '.csv')
      uploader.addToQueue([newCSVFile])

      const rangeStartAddress = found.startAddress || ''
      const rangeEndAddress = found.endAddress || ''

      resolve({
        uploader,
        data: csvArrayData,
        rangeSheetRes: {
          found: true,
          sheetName: found.sheetName,
          rangeStartAddress: rangeStartAddress,
          rangeEndAddress: rangeEndAddress,
          rangeAddress: `${rangeStartAddress}:${rangeEndAddress}`,
          missingHeaders: []
        },
        headerShow: parseParams.headerShow
      })
    })
  }

  /**
   * Prepares CSV file for the upload. CSV upload is rejected when licence has
   * a rows limit. When encoding is `WLATIN1` the file content is decoded as
   * CP-1252 and the queued file is replaced with the decoded one, otherwise
   * the uploader is returned untouched.
   *
   * @param parseParams params holding the file and encoding
   * @param uploader uploader holding the queued file
   * @param fileType file extension, used as the type of the re-created blob
   *
   * @returns object holding the uploader
   */
  private parseCsvFile(
    parseParams: ParseParams,
    uploader: FileUploader,
    fileType: string
  ): Promise<{ uploader: FileUploader }> {
    return new Promise((resolve, reject) => {
      if (this.licenceState.value.submit_rows_limit !== Infinity) {
        uploader.queue.pop()
        return reject(
          'Excel files only. To unlock CSV uploads, please contact support@datacontroller.io'
        )
      }

      if (parseParams.encoding !== 'WLATIN1') {
        return resolve({ uploader })
      }

      const reader = new FileReader()

      reader.onload = (theFile: any) => {
        // Errors thrown in the callback would otherwise leave promise pending
        try {
          const decoded = iconv.decode(
            Buffer.from(theFile.target.result),
            'CP-1252'
          )
          const blob = new Blob([decoded], { type: fileType })
          const encodedFile: File = blobToFile(blob, parseParams.file.name)

          uploader.queue.pop()
          uploader.addToQueue([encodedFile])

          resolve({ uploader })
        } catch (err) {
          reject(err)
        }
      }
      reader.onerror = () => reject('Error reading the file')

      reader.readAsArrayBuffer(parseParams.file)
    })
  }

  /**
   * Converts bytes to megabytes, rounded to 2 decimal places
   *
   * @param size size in bytes
   */
  public bytesToMB(size: number): number {
    return parseFloat((size / (1024 * 1024)).toFixed(2))
  }

  /**
   * Wrapper function for XLSX.read() with integrated 'unlock' functionality
   * Used by multi load component to load the file while user chooses the datasets
   * to be updated
   *
   * When the read fails with a password related error, user is prompted for
   * the password until file is unlocked, prompt is dismissed or a non password
   * error happens.
   *
   * @param fileReaderResponse response from the file reader
   * @param promptExcelPassword password callback
   * @param password password provided by the user
   * @returns WorkBook. Rejects with `Error reading the file` when first read
   * fails for a non password reason and with `Failed to parse a workbook` when
   * no workbook is produced
   */
  public async xslxStartReading(
    fileReaderResponse: any,
    promptExcelPassword: (options?: OpenOptions) => Promise<string | undefined>,
    password?: string
  ): Promise<XLSX.WorkBook> {
    /* read workbook */
    const bstr = this.toBstr(fileReaderResponse.target.result)

    const xlsxOptions: XLSX.ParsingOptions = {
      type: 'binary',
      cellDates: false,
      cellFormula: true,
      cellStyles: true,
      cellNF: false,
      cellText: false,
      password: password
    }

    const isPasswordError = (err: unknown): boolean =>
      this.errorMessage(err).toLowerCase().includes('password')

    let wb: XLSX.WorkBook | undefined = undefined

    try {
      wb = await this.xlsxRead(bstr, { ...xlsxOptions })
    } catch (err: unknown) {
      if (!isPasswordError(err)) {
        return Promise.reject('Error reading the file')
      }

      let fileUnlocking = true
      let passwordError = false

      while (fileUnlocking) {
        const userPassword = await promptExcelPassword({
          error: passwordError
        })

        if (!userPassword) {
          // Prompt dismissed
          fileUnlocking = false
          continue
        }

        try {
          wb = await this.xlsxRead(bstr, {
            ...xlsxOptions,
            password: userPassword
          })

          fileUnlocking = false
        } catch (retryErr: unknown) {
          passwordError = true

          // Any other error than wrong password stops the unlocking
          if (!isPasswordError(retryErr)) fileUnlocking = false
        }
      }
    }

    if (!wb) return Promise.reject('Failed to parse a workbook')

    return wb
  }

  /**
   * Extracts message from a thrown value which can be a string, an Error or
   * any object with a `message`. Returns empty string otherwise.
   */
  private errorMessage(err: unknown): string {
    if (typeof err === 'string') return err

    const message = (err as { message?: unknown } | null | undefined)?.message

    return typeof message === 'string' ? message : ''
  }

  /**
   * XLSX Read wrapper which uses Web Worker to read the file and not block
   * the UI while reading. It will allow reading bigger files.
   * If worker fails, fallback is regular file read.
   *
   * Worker is not used when password is provided or when `Worker` is not
   * available. Worker is terminated and the fallback timer is cleared as soon
   * as the result is known.
   *
   * @param data file content passed to `XLSX.read`
   * @param opts parsing options passed to `XLSX.read`
   * @returns parsed workbook
   */
  private xlsxRead(
    data: any,
    opts?: XLSX.ParsingOptions | undefined
  ): Promise<XLSX.WorkBook> {
    return new Promise((resolve, reject) => {
      if (opts && opts.password) {
        console.info('Not using worker to parse the XLSX - has password')
        // At the moment worker can't use crypto version of SheetJS because of
        // 'global not defined' issue
        return resolve(XLSX.read(data, opts))
      }

      if (typeof Worker === 'undefined') {
        console.info(
          'Not using worker to parse the XLSX - no Worker available in this environment'
        )
        // Web workers are not supported in this environment.
        return resolve(XLSX.read(data, opts))
      }

      // Ultimately use Web Worker to parse the excel
      console.info('Using worker to parse the XLSX')

      const worker = new Worker(
        new URL('../../spreadsheet.worker', import.meta.url)
      )

      let settled = false
      let fallbackTimer: ReturnType<typeof setTimeout> | undefined

      // Settles the promise only once and releases the worker and the timer
      const finish = (settle: () => void) => {
        if (settled) return
        settled = true

        if (fallbackTimer !== undefined) clearTimeout(fallbackTimer)
        worker.terminate()

        try {
          settle()
        } catch (err) {
          reject(err)
        }
      }

      // Fallback to reading without Worker
      const readWithoutWorker = () =>
        finish(() => resolve(XLSX.read(data, opts)))

      // Message payload is named `message` so it does not shadow file `data`
      worker.onmessage = ({ data: message }) => {
        if (message.event === 'reading_end') {
          finish(() => resolve(message.workbook))
        } else if (message.error) {
          finish(() => reject(message.error))
        } else {
          console.info(
            'Worker failed to parse the XLSX - fallback to non worker parsing'
          )
          readWithoutWorker()
        }
      }

      worker.onerror = () => {
        console.info(
          'Worker failed to parse the XLSX - fallback to non worker parsing'
        )
        readWithoutWorker()
      }

      worker.postMessage({
        data,
        opts
      })

      // Big timeout (10 minutes) in case Worker fails and no response
      // and read the file with fallback method without worker
      fallbackTimer = setTimeout(readWithoutWorker, 600 * 1000)
    })
  }

  /**
   * Searches sheets of the workbook, in order, for a table holding all of the
   * expected headers (case insensitive). `_____DELETE__THIS__RECORD_____`
   * header is optional.
   *
   * For the first sheet with all headers, data is read column by column below
   * every header. First processed column sets the last row (first empty cell
   * ends the table). Rows starting with the first row with a missing primary
   * key value are removed.
   *
   * @param wb workbook to search
   * @param parseParams params holding `headerArray` and `headerPks`
   *
   * @returns `found` with the data when a table is found, `missing` with the
   * missing headers per sheet when no sheet has all headers, or an empty
   * object when the table has no rows
   */
  private searchDataInExcel(
    wb: XLSX.WorkBook,
    parseParams: ParseParams
  ): SearchDataExcelResult {
    const deleteRecordHeader = '_____delete__this__record_____'
    const missing: MissingHeaders[] = []

    const csvArrayHeadersLower = [
      '_____DELETE__THIS__RECORD_____',
      ...parseParams.headerArray
    ].map((x) => x.toLowerCase())

    for (const sheetName of wb.SheetNames) {
      const ws: XLSX.WorkSheet = wb.Sheets[sheetName]

      let headerStartAddress: string | undefined
      let headerEndAddress: string | undefined

      // Address of every expected header, `undefined` until it is found
      const headerAddresses: Record<string, string | undefined> = {}
      for (const header of csvArrayHeadersLower) {
        headerAddresses[header] = undefined
      }

      // Find the headers
      for (const wsKey of Object.keys(ws)) {
        const cellValue = ws[wsKey]?.v

        // If the cell does not have `v` property we ignore it, those are metadata properties
        if (cellValue && typeof cellValue === 'string') {
          const potentialHeader = cellValue.toLowerCase()

          if (csvArrayHeadersLower.includes(potentialHeader)) {
            headerAddresses[potentialHeader] = wsKey

            if (!headerStartAddress) headerStartAddress = wsKey

            // Update on every found header, until the end, which will leave
            // last found header address
            headerEndAddress = wsKey
          }
        }
      }

      // If _____delete__this__record_____ is not found in the file, remove it
      if (headerAddresses[deleteRecordHeader] === undefined)
        delete headerAddresses[deleteRecordHeader]

      // Parse missing headers, if any, abort the search and jump to next sheet
      const missingHeaders = Object.keys(headerAddresses).filter(
        (header) => headerAddresses[header] === undefined
      )

      if (missingHeaders.length > 0) {
        missing.push({
          sheetName: sheetName,
          missingHeaders: missingHeaders.map((header) => header.toUpperCase())
        })

        continue
      }

      // If no headers are missing, start parsing the data column by column
      const foundHeaders = Object.keys(headerAddresses)
      const json: any[] = []
      const arrayData: any[][] = []
      let endRow: number | undefined

      // Sort the headers so first headers are primary key columns
      // NOTE(review): headers are lower cased here while `headerPks` values
      // are compared as they are (same in the loop below). If `headerPks` is
      // upper case nothing is matched and the order stays unchanged. Data
      // columns follow this order, so verify the header order used by the
      // caller before making this case insensitive.
      const foundHeadersSorted = foundHeaders.sort((a: string, b: string) => {
        const aIsPk = parseParams.headerPks.includes(a) ? 1 : 0
        const bIsPk = parseParams.headerPks.includes(b) ? 1 : 0

        return bIsPk - aIsPk
      })

      for (const header of foundHeadersSorted) {
        const headerAddress = headerAddresses[header] as string
        const letterMatch = headerAddress.match(/\D+/)
        const numberMatch = headerAddress.match(/\d+/)
        const headerAddressLetter = letterMatch ? letterMatch[0] : '-1'
        const headerAddressNumber = numberMatch
          ? parseInt(numberMatch[0], 10)
          : -1
        const firstDataRow = headerAddressNumber + 1
        let jsonRow = 0

        if (endRow) {
          // If end row found, use it as a limit
          for (let row = firstDataRow; row <= endRow; row++) {
            const cell = ws[`${headerAddressLetter}${row}`]

            if (parseParams.headerPks.includes(header)) {
              // If this column is primary key and has less rows, set new endRow
              if (cell === undefined || cell.v === undefined) {
                endRow = row
                break
              }
            }

            // Push to array of objects
            if (!json[jsonRow]) json.push({})
            if (cell)
              json[jsonRow][header] =
                typeof cell.v === 'string' ? cell.v.trim() : cell.v

            // Push to array of arrays, but with all cell meta info
            if (!arrayData[jsonRow]) arrayData.push([])
            arrayData[jsonRow].push(cell ?? { v: '' })

            jsonRow++
          }
        } else {
          // If end row not found, go through rows until empty row appears
          let cellsRow = firstDataRow

          while (endRow === undefined) {
            const cell = ws[`${headerAddressLetter}${cellsRow}`]

            if (!cell || cell.v === undefined) {
              // This is an empty row, row before this one is the last row with data
              endRow = cellsRow - 1
              break
            }

            // Push to array of objects
            if (!json[jsonRow]) json.push({})

            if (cell.t === 'n') {
              // If type is number, use the Underlying value, otherwise use Formatted text
              // https://docs.sheetjs.com/docs/csf/cell
              json[jsonRow][header] = cell.v
            } else if (cell.w) {
              json[jsonRow][header] = cell.w
            } else {
              json[jsonRow][header] =
                typeof cell.v === 'string' ? cell.v.trim() : cell.v
            }

            // Push to array of arrays, but with all cell meta info
            if (!arrayData[jsonRow]) arrayData.push([])
            arrayData[jsonRow].push(cell)

            cellsRow++
            jsonRow++
          }
        }
      }

      let rangeStartAddress: string | undefined
      let rangeEndAddress: string | undefined

      if (headerStartAddress && headerEndAddress) {
        const endLetterMatch = headerEndAddress.match(/\D+/)

        rangeStartAddress = headerStartAddress
        rangeEndAddress = `${endLetterMatch ? endLetterMatch[0] : ''}${endRow}`
      }

      // Remove all rows starting from the first row with missing PK column
      // even if some rows after have populated PK.
      // Index is compared with -1, since 0 is a valid (first) row index.
      const firstRowIndexMissingPk = json.findIndex((row: any) =>
        parseParams.headerPks.some(
          (pkHeader) => row[pkHeader.toLowerCase()] === undefined
        )
      )

      if (firstRowIndexMissingPk !== -1) {
        json.splice(firstRowIndexMissingPk)
        arrayData.splice(firstRowIndexMissingPk)
      }

      if (!arrayData.length) {
        return {}
      }

      // If we got to this point it means headers are matched
      return {
        found: {
          data: json,
          arrayData: arrayData,
          sheetName: sheetName,
          startAddress: rangeStartAddress,
          endAddress: rangeEndAddress,
          headers: foundHeaders
        }
      }
    }

    // No complete data found
    return {
      missing: missing
    }
  }

  /**
   * Returns indexes of `names` inside of `headers`, names which are not
   * present in `headers` are skipped
   */
  private findColumnIndexes(headers: any, names: any[]): number[] {
    return names
      .map((name: any) => headers.indexOf(name) as number)
      .filter((index: number) => index !== -1)
  }

  /**
   * Converts values (`v`) of the cells in date, time and datetime columns.
   * Numeric values are handled as excel dates, string values are parsed with
   * `Date`. Values which can not be converted are left as they are.
   * Cells are updated in place.
   *
   * @param headers headers in the order of the row cells
   * @param data rows of cell objects
   * @param parseParams params holding date, time and datetime headers
   *
   * @returns same `data` array
   */
  private updateDateTimeCols(
    headers: any,
    data: any,
    parseParams: ParseParams
  ) {
    if (parseParams.dateHeaders.length > 0) {
      const dateCols = this.findColumnIndexes(headers, parseParams.dateHeaders)

      data.forEach((row: any[]) => {
        dateCols.forEach((colIndex) => {
          const obj = row[colIndex]
          if (!obj) return

          if (isStringNumber(obj.v)) {
            const date = excelDateToJSDate(Number(obj.v))

            obj.v =
              date.getFullYear() +
              '-' +
              ('0' + (date.getMonth() + 1)).slice(-2) +
              '-' +
              ('0' + date.getDate()).slice(-2)
          } else if (obj.v && obj.v.toString().indexOf(':') === -1) {
            const date = new Date(obj.v)

            if (date.toUTCString() !== 'Invalid Date') {
              obj.v = dateFormat(date)
            }
          }
        })
      })
    }

    if (parseParams.timeHeaders.length > 0) {
      const timeCols = this.findColumnIndexes(headers, parseParams.timeHeaders)

      data.forEach((row: any[]) => {
        timeCols.forEach((colIndex) => {
          const obj = row[colIndex]
          if (!obj) return

          // `includes` exists only on strings, other values must not throw
          if (
            isStringNumber(obj.v) ||
            isStringDecimal(obj.v) ||
            (typeof obj.v === 'string' && obj.v.includes('E-'))
          ) {
            const date = excelDateToJSDate(Number(obj.v))
            obj.v = dateToUtcTime(date)
          }
        })
      })
    }

    if (parseParams.dateTimeHeaders.length > 0) {
      const dateTimeCols = this.findColumnIndexes(
        headers,
        parseParams.dateTimeHeaders
      )

      data.forEach((row: any[]) => {
        dateTimeCols.forEach((colIndex) => {
          const obj = row[colIndex]
          if (!obj) return

          if (isStringNumber(obj.v) || isStringDecimal(obj.v)) {
            const date = excelDateToJSDate(Number(obj.v))
            obj.v = dateFormat(date) + ' ' + dateToUtcTime(date)
          } else if (typeof obj.v === 'string') {
            // Value without space has the first `:` replaced with space
            // before parsing
            if (obj.v.indexOf(' ') === -1 && obj.v.indexOf(':') !== -1) {
              const separatorIndex = obj.v.indexOf(':')

              obj.v =
                obj.v.substring(0, separatorIndex) +
                ' ' +
                obj.v.substring(separatorIndex + 1)
            }

            const date = new Date(obj.v)

            if (date.toUTCString() !== 'Invalid Date') {
              obj.v = dateFormat(date) + ' ' + dateToTime(date)
            }
          }
        })
      })
    }

    return data
  }

  /**
   * Applies excel rules to the matching columns. For `FORMULA` rule, cell
   * with a formula (`f`) gets `=` + formula stored in `v` when cell is
   * numeric, otherwise in `w`. Cells are updated in place, `xlRules` are not
   * modified.
   *
   * @param headers headers in the order of the row cells
   * @param data rows of cell objects
   * @param parseParams params holding `xlRules`
   *
   * @returns same `data` array
   */
  private updateXLRuleCols(headers: any, data: any, parseParams: ParseParams) {
    const ruleColumns = parseParams.xlRules
      .map((rule: any) => ({
        rule,
        index: headers.indexOf(rule.XL_COLUMN) as number
      }))
      .filter((ruleColumn) => ruleColumn.index !== -1)

    if (ruleColumns.length === 0) return data

    data.forEach((row: any[]) => {
      ruleColumns.forEach(({ rule, index }) => {
        const obj = row[index]

        if (rule.XL_RULE === 'FORMULA' && obj && 'f' in obj) {
          if (obj['t'] === 'n') {
            obj['v'] = '=' + obj['f']
          } else {
            obj['w'] = '=' + obj['f']
          }
        }
      })
    })

    return data
  }

  /**
   * Converts array buffer to a binary string, where every byte becomes one
   * character with the same char code
   *
   * @param res array buffer (or anything accepted by `Uint8Array`)
   */
  private toBstr(res: any) {
    const bytes = new Uint8Array(res)
    // Converted in chunks to keep the number of `fromCharCode` arguments
    // small and to avoid string concatenation per every byte
    const chunkSize = 0x8000
    let binary = ''

    for (let offset = 0; offset < bytes.length; offset += chunkSize) {
      binary += String.fromCharCode(
        ...bytes.subarray(offset, offset + chunkSize)
      )
    }

    return binary
  }
}