feat: searching data in excel files using new algorithm (massive performance improvement) #123

Merged
allan merged 5 commits from issue-120 into main 2024-08-31 14:23:48 +00:00
13 changed files with 786 additions and 702 deletions

View File

@ -109,13 +109,8 @@ context('excel tests: ', function () {
openTableFromTree(libraryToOpenIncludes, 'mpe_x_test') openTableFromTree(libraryToOpenIncludes, 'mpe_x_test')
attachExcelFile('duplicate_column_excel.xlsx', () => { attachExcelFile('duplicate_column_excel.xlsx', () => {
cy.get('.abortMsg', { timeout: longerCommandTimeout }) submitExcel()
.should('exist') rejectExcel(done)
.then((elements: any) => {
if (elements[0]) {
if (elements[0].innerText.toLowerCase().includes('missing')) done()
}
})
}) })
}) })

View File

@ -18,6 +18,7 @@ import {
ClarityIcons, ClarityIcons,
exclamationTriangleIcon, exclamationTriangleIcon,
moonIcon, moonIcon,
processOnVmIcon,
sunIcon, sunIcon,
tableIcon, tableIcon,
trashIcon trashIcon
@ -28,7 +29,8 @@ ClarityIcons.addIcons(
sunIcon, sunIcon,
exclamationTriangleIcon, exclamationTriangleIcon,
tableIcon, tableIcon,
trashIcon trashIcon,
processOnVmIcon
) )
@Component({ @Component({

View File

@ -2,11 +2,17 @@ import { DcValidator } from '../shared/dc-validator/dc-validator'
import { FileUploadEncoding } from './FileUploadEncoding' import { FileUploadEncoding } from './FileUploadEncoding'
import { FileUploader } from './FileUploader.class' import { FileUploader } from './FileUploader.class'
import { ExcelRule } from './TableData' import { ExcelRule } from './TableData'
import XLSX from 'xlsx'
export interface ParseParams { export interface ParseParams {
file: File file: File
password?: string password?: string
dcValidator: DcValidator dcValidator: DcValidator
/**
* If a workbook is provided, the parse function will not run XLSX.read();
* it will use this property instead, so the client must read the file beforehand.
*/
workbook?: XLSX.WorkBook
/** /**
* Parse function will manipulate and return the uploader array which can be provided with files already in the queue * Parse function will manipulate and return the uploader array which can be provided with files already in the queue
* Otherwise new empty instance will be created. * Otherwise new empty instance will be created.

View File

@ -1,5 +1,5 @@
import { FileUploader } from './FileUploader.class' import { FileUploader } from './FileUploader.class'
import SheetInfo from './SheetInfo' import FoundRangeInfo from './RangeInfo'
export interface ParseResult { export interface ParseResult {
/** /**
@ -10,6 +10,6 @@ export interface ParseResult {
* In case of CSV file, won't be returned * In case of CSV file, won't be returned
*/ */
headerShow?: string[] headerShow?: string[]
rangeSheetRes?: SheetInfo rangeSheetRes?: FoundRangeInfo
uploader: FileUploader uploader: FileUploader
} }

View File

@ -0,0 +1,13 @@
/**
 * Describes the outcome of searching a workbook for a data range.
 * Exposed on ParseResult as `rangeSheetRes`.
 *
 * NOTE(review): the per-field descriptions below are inferred from the
 * property names only — confirm against the code that populates this object.
 */
export default interface FoundRangeInfo {
/** Presumably true when a matching range was located — TODO confirm */
found: boolean
/** Presumably the name of the sheet containing the range — TODO confirm */
sheetName: string
/** Presumably the address of the first cell of the range — TODO confirm format */
rangeStartAddress: string
/** Presumably the address of the last cell of the range — TODO confirm format */
rangeEndAddress: string
/** Presumably the full address of the range — TODO confirm format */
rangeAddress: string
/** Missing-header entries, each carrying its own sheet name (see MissingHeaders) */
missingHeaders: MissingHeaders[]
}
/**
 * A list of missing header names associated with one sheet.
 */
export interface MissingHeaders {
/** Name of the sheet this entry refers to */
sheetName: string
/** Header names reported as missing for that sheet */
missingHeaders: string[]
}

View File

@ -0,0 +1,13 @@
import { MissingHeaders } from './RangeInfo'
/**
 * Result shape of an excel data search. Both members are optional.
 *
 * NOTE(review): presumably exactly one of `missing` / `found` is set per
 * result — the type does not enforce this, confirm against the producer.
 */
export interface SearchDataExcelResult {
/** Missing-header entries, one per sheet */
missing?: MissingHeaders[]
/** Details of the located data */
found?: {
/** NOTE(review): untyped payload — shape is not visible here, confirm with the producer */
data: any
/** NOTE(review): presumably the same data in array form — confirm */
arrayData: any[]
/** Name of the sheet the data belongs to */
sheetName: string
/** Header names of the located data */
headers: string[]
/** Optional start address of the located data — TODO confirm format */
startAddress?: string
/** Optional end address of the located data — TODO confirm format */
endAddress?: string
}
}

View File

@ -198,6 +198,19 @@
*ngIf="!activeParsedDataset" *ngIf="!activeParsedDataset"
class="no-table-selected pointer-events-none" class="no-table-selected pointer-events-none"
> >
<ng-container *ngIf="fileLoadingState !== FileLoadingState.parsed">
<clr-icon
shape="process-on-vm"
size="40"
class="is-info icon-dc-fill"
></clr-icon>
<p class="text-center color-gray mt-10" cds-text="section">
{{ fileLoadingState }}...
</p>
</ng-container>
<ng-container *ngIf="fileLoadingState === FileLoadingState.parsed">
<clr-icon <clr-icon
shape="warning-standard" shape="warning-standard"
size="40" size="40"
@ -206,6 +219,7 @@
<p class="text-center color-gray mt-10" cds-text="section"> <p class="text-center color-gray mt-10" cds-text="section">
Please select a dataset on the left to review the data Please select a dataset on the left to review the data
</p> </p>
</ng-container>
</div> </div>
<ng-container *ngIf="activeParsedDataset"> <ng-container *ngIf="activeParsedDataset">

View File

@ -30,6 +30,17 @@ import { UploadFile } from '@sasjs/adapter'
import { UploadFileResponse } from '../models/UploadFile' import { UploadFileResponse } from '../models/UploadFile'
import { RequestWrapperResponse } from '../models/request-wrapper/RequestWrapperResponse' import { RequestWrapperResponse } from '../models/request-wrapper/RequestWrapperResponse'
import { ParseResult } from '../models/ParseResult.interface' import { ParseResult } from '../models/ParseResult.interface'
import XLSX from 'xlsx'
/**
 * Loading stages of the selected excel file.
 * The string values are user-facing: the template renders the current
 * value followed by "..." while the state is not `parsed`.
 */
enum FileLoadingState {
/** Set in onFileChange right after an excel file is selected */
reading = 'Reading the file',
/** Set once xlsxReadFile has resolved with the workbook */
parsing = 'Searching for the data in the file',
/** Set when a parseExcelFile call resolves */
parsed = 'Searching for the data finished',
/**
* Default value
*/
notSelected = 'File not selected'
}
@Component({ @Component({
selector: 'app-multi-dataset', selector: 'app-multi-dataset',
@ -43,6 +54,11 @@ export class MultiDatasetComponent implements OnInit {
public licenceState = this.licenceService.licenceState public licenceState = this.licenceService.licenceState
public Infinity = Infinity public Infinity = Infinity
public workbookInterval: any
public fileLoadingState: FileLoadingState = FileLoadingState.notSelected
public FileLoadingState = FileLoadingState
public hotTableLicenseKey: string | undefined = undefined public hotTableLicenseKey: string | undefined = undefined
public hotTableMaxRows = public hotTableMaxRows =
this.licenceState.value.viewer_rows_allowed || Infinity this.licenceState.value.viewer_rows_allowed || Infinity
@ -163,7 +179,7 @@ export class MultiDatasetComponent implements OnInit {
} }
} }
onFileChange(event: any) { async onFileChange(event: any) {
const files = event?.target?.files || [] const files = event?.target?.files || []
if (files.length < 1) { if (files.length < 1) {
@ -200,11 +216,20 @@ export class MultiDatasetComponent implements OnInit {
// For EXCEL if multiple files, we only take one (the first one) // For EXCEL if multiple files, we only take one (the first one)
this.selectedFile = event.target.files[0] this.selectedFile = event.target.files[0]
if (this.selectedFile) if (this.selectedFile) {
this.fileLoadingState = FileLoadingState.reading
this.selectedFile.sizeMB = this.spreadsheetService.bytesToMB( this.selectedFile.sizeMB = this.spreadsheetService.bytesToMB(
this.selectedFile.size this.selectedFile.size
) )
// Read the excel file to be ready
this.spreadsheetService.xlsxReadFile(this.selectedFile!).then((wb) => {
this.fileLoadingState = FileLoadingState.parsing
this.selectedFile!.workbook = wb
})
}
this.initUserInputHot() this.initUserInputHot()
this.onAutoDetectColumns() this.onAutoDetectColumns()
} else if (matchedExtension === 'csv') { } else if (matchedExtension === 'csv') {
@ -291,10 +316,12 @@ export class MultiDatasetComponent implements OnInit {
}) })
}) })
this.workbookLoaded().then((workbook) => {
for (let parsedDataset of this.parsedDatasets) { for (let parsedDataset of this.parsedDatasets) {
this.spreadsheetService this.spreadsheetService
.parseExcelFile({ .parseExcelFile({
file: this.selectedFile!, file: this.selectedFile!,
workbook: workbook,
password: this.selectedFile!.password || undefined, password: this.selectedFile!.password || undefined,
dcValidator: parsedDataset.datasetInfo.dcValidator!, dcValidator: parsedDataset.datasetInfo.dcValidator!,
headerPks: parsedDataset.datasetInfo.headerPks, headerPks: parsedDataset.datasetInfo.headerPks,
@ -306,7 +333,7 @@ export class MultiDatasetComponent implements OnInit {
xlRules: parsedDataset.datasetInfo.xlRules xlRules: parsedDataset.datasetInfo.xlRules
}) })
.then((parseResult: ParseResult | undefined) => { .then((parseResult: ParseResult | undefined) => {
console.log('parseResult', parseResult) this.fileLoadingState = FileLoadingState.parsed
if (parseResult && parseResult.data) { if (parseResult && parseResult.data) {
let datasource: any[] = [] let datasource: any[] = []
@ -339,6 +366,7 @@ export class MultiDatasetComponent implements OnInit {
}) })
} }
}) })
})
} }
onSubmitAll() { onSubmitAll() {
@ -826,6 +854,23 @@ export class MultiDatasetComponent implements OnInit {
if (newSubmittedDataset) newSubmittedDataset.active = true if (newSubmittedDataset) newSubmittedDataset.active = true
} }
/**
 * Waits until the workbook of the currently selected file is available.
 *
 * The workbook is attached to `selectedFile.workbook` asynchronously after
 * the file has been read, so this method polls that property every 500 ms.
 * If the workbook is already present the promise resolves without waiting.
 *
 * @returns Promise resolving with the workbook once it is loaded. Rejects with
 * 'No file selected' when there is no selected file, either at call time or
 * because the selection was cleared while waiting.
 */
private workbookLoaded(): Promise<XLSX.WorkBook> {
  return new Promise((resolve, reject) => {
    if (!this.selectedFile) {
      reject('No file selected')
      // Must stop here: otherwise the polling below would start and throw on
      // every tick while dereferencing the missing file.
      return
    }

    // Fast path: no need to wait a full tick if the read already finished.
    if (this.selectedFile.workbook) {
      resolve(this.selectedFile.workbook)
      return
    }

    // Keep a local handle so each call clears its own timer even when the
    // shared `workbookInterval` field is overwritten by a later call.
    const interval = setInterval(() => {
      if (!this.selectedFile) {
        clearInterval(interval)
        reject('No file selected')
        return
      }

      if (this.selectedFile.workbook) {
        clearInterval(interval)
        resolve(this.selectedFile.workbook)
      }
    }, 500)

    // Still exposed on the component for any code that reads this field.
    this.workbookInterval = interval
  })
}
private parseDatasetFromCsvName(fileName: string) { private parseDatasetFromCsvName(fileName: string) {
const fileNameArr = fileName.split('.') const fileNameArr = fileName.split('.')
fileNameArr.pop() fileNameArr.pop()
@ -1044,4 +1089,5 @@ export interface SubmittedCsvDatasetResult {
export interface SelectedFile extends File { export interface SelectedFile extends File {
sizeMB?: number sizeMB?: number
password?: string password?: string
workbook?: XLSX.WorkBook
} }

View File

@ -30,7 +30,7 @@ export class SpreadsheetService {
licenceState: this.licenceState licenceState: this.licenceState
}) })
return spreadSheetUtil.parseExcelFile( return spreadSheetUtil.parseSpreadsheetFile(
parseParams, parseParams,
this.promptExcelPassword, this.promptExcelPassword,
onParseStateChange, onParseStateChange,
@ -38,6 +38,37 @@ export class SpreadsheetService {
) )
} }
/**
 * Reads the excel file into a workbook.
 *
 * The file is loaded as an ArrayBuffer with a FileReader and the load event
 * is handed to `SpreadsheetUtil.xslxStartReading` together with the password
 * prompt callback.
 * NOTE(review): the original documentation states that a web worker is used
 * when possible with a fallback otherwise — that logic is not in this
 * method, presumably it lives inside `xslxStartReading`; confirm.
 *
 * @param file selected in an <input>
 * @returns Promise resolving with the WorkBook. Rejects when the file cannot
 * be read (reader error or abort) or when `xslxStartReading` fails.
 */
public xlsxReadFile(file: any): Promise<XLSX.WorkBook> {
  return new Promise((resolve, reject) => {
    const spreadSheetUtil = new SpreadsheetUtil({
      licenceState: this.licenceState
    })

    const reader = new FileReader()

    reader.onload = (fileReaderResponse: any) => {
      try {
        spreadSheetUtil
          .xslxStartReading(fileReaderResponse, this.promptExcelPassword)
          .then(resolve)
          .catch(reject)
      } catch (err) {
        // A synchronous throw inside an event handler would otherwise leave
        // the promise pending forever.
        reject(err)
      }
    }

    // Without these handlers a failed or aborted read never settles the
    // promise and callers wait indefinitely.
    reader.onerror = () => reject(reader.error)
    reader.onabort = () => reject(new Error('File reading was aborted'))

    reader.readAsArrayBuffer(file)
  })
}
/** /**
* Read the file minimally just to get the sheet names, not reading full file * Read the file minimally just to get the sheet names, not reading full file
* to help boost the performance * to help boost the performance

File diff suppressed because it is too large Load Diff

8
sas/package-lock.json generated
View File

@ -7,7 +7,7 @@
"name": "dc-sas", "name": "dc-sas",
"dependencies": { "dependencies": {
"@sasjs/cli": "^4.11.1", "@sasjs/cli": "^4.11.1",
"@sasjs/core": "^4.52.4" "@sasjs/core": "^4.52.5"
} }
}, },
"node_modules/@coolaj86/urequest": { "node_modules/@coolaj86/urequest": {
@ -116,9 +116,9 @@
"integrity": "sha512-Grwydm5GxBsYk238PZw41XPjXVVQ9vWcvfZ06L2P0bQbvK0sGn7l69JA7H5MGr3QcaLpiD4Kg70cAh7PgE+JOw==" "integrity": "sha512-Grwydm5GxBsYk238PZw41XPjXVVQ9vWcvfZ06L2P0bQbvK0sGn7l69JA7H5MGr3QcaLpiD4Kg70cAh7PgE+JOw=="
}, },
"node_modules/@sasjs/core": { "node_modules/@sasjs/core": {
"version": "4.52.4", "version": "4.52.5",
"resolved": "https://registry.npmjs.org/@sasjs/core/-/core-4.52.4.tgz", "resolved": "https://registry.npmjs.org/@sasjs/core/-/core-4.52.5.tgz",
"integrity": "sha512-8lf5ixlA312EgA2DorwbpNXXPfLPzUHO67exIV7SjKiU23Tn1au5GD6hT0Ysr2kophOs10Mp1TCXJjhEq7Qk4A==" "integrity": "sha512-fGuLC+DcH2AoIDDU/Eyn7d4ZIrIeQAN3PB9FgjBikiGuXZYooRELKz1WgaQvhI4qSKgczUANaT80ZJOpfH+sQQ=="
}, },
"node_modules/@sasjs/lint": { "node_modules/@sasjs/lint": {
"version": "2.3.1", "version": "2.3.1",

View File

@ -29,6 +29,6 @@
"private": true, "private": true,
"dependencies": { "dependencies": {
"@sasjs/cli": "^4.11.1", "@sasjs/cli": "^4.11.1",
"@sasjs/core": "^4.52.4" "@sasjs/core": "^4.52.5"
} }
} }

View File

@ -100,6 +100,7 @@ Areas for optimisation
@li mf_getattrn.sas @li mf_getattrn.sas
@li mf_getengine.sas @li mf_getengine.sas
@li mf_getschema.sas @li mf_getschema.sas
@li mf_getuniquefileref.sas
@li mf_getuniquename.sas @li mf_getuniquename.sas
@li mf_getuser.sas @li mf_getuser.sas
@li mf_getvarlist.sas @li mf_getvarlist.sas
@ -621,7 +622,7 @@ data work.bitemp0_append &keepvars &outds_del(drop=&md5_col )
%put DCNOTE: Extracting matching observations from &base_lib..&base_dsn; %put DCNOTE: Extracting matching observations from &base_lib..&base_dsn;
%if &engine_type=OLEDB %then %do; %if &engine_type=OLEDB %then %do;
%let temp_table=##BITEMP_&base_dsn; %let temp_table=##%mf_getuniquefileref(prefix=BTMP)_&base_dsn;
%if &loadtype=BITEMPORAL or &loadtype=TXTEMPORAL %then %if &loadtype=BITEMPORAL or &loadtype=TXTEMPORAL %then
%let base_table=(select * from [dbo].&base_dsn %let base_table=(select * from [dbo].&base_dsn
where convert(datetime,&SQLNOW) < &tech_to ); where convert(datetime,&SQLNOW) < &tech_to );
@ -1049,7 +1050,7 @@ run;
%let cat_string=catx('|' ,&bus_from,&bus_to); %let cat_string=catx('|' ,&bus_from,&bus_to);
data work.bitemp5a_lkp (keep=&md5_col) data work.bitemp5a_lkp (keep=&md5_col)
%if "%substr(&sysver,1,1)" ne "4" and "%substr(&sysver,1,1)" ne "5" %then %do; %if "%substr(&sysver,1,1)" ne "4" & "%substr(&sysver,1,1)" ne "5" %then %do;
/nonote2err /nonote2err
%end; %end;
; ;
@ -1191,10 +1192,10 @@ run;
/* if OLEDB then create a temp table for efficiency */ /* if OLEDB then create a temp table for efficiency */
%local innertable; %local innertable;
%if &engine_type=OLEDB %then %do; %if &engine_type=OLEDB %then %do;
%let innertable=[##BITEMP_&base_dsn]; %let innertable=[&temp_table];
%let top_table=[dbo].&base_dsn; %let top_table=[dbo].&base_dsn;
%let flexinow=&SQLNOW; %let flexinow=&SQLNOW;
create table &base_lib.."##BITEMP_&base_dsn"n as create table &base_lib.."&temp_table"n as
select * from work.bitemp5d_subquery; select * from work.bitemp5d_subquery;
/* open up a connection for pass through SQL */ /* open up a connection for pass through SQL */
%dc_assignlib(WRITE,&base_lib,passthru=myAlias) %dc_assignlib(WRITE,&base_lib,passthru=myAlias)