Merge branch 'ui-v2' of https://github.com/IGS/gEAR into ui-v2
adkinsrs committed Sep 4, 2024
2 parents d808097 + 04b2021 commit 01ada54
Showing 5 changed files with 91 additions and 36 deletions.
91 changes: 63 additions & 28 deletions www/cgi/process_uploaded_expression_dataset.cgi
@@ -14,20 +14,22 @@ status.json
"progress": 0
}
Where status can be 'extracting', 'processing', 'error', or 'complete'.
Where status can be 'uploaded', 'extracting', 'processing', 'error', or 'complete'.
"""

import cgi
import json
import os, sys
import time

import pandas as pd
import scanpy as sc
from scipy import sparse

# This has a huge dependency stack of libraries. Occasionally, one of them has methods
# which prints debugging information on STDERR, killing this CGI. So here we redirect
# which prints debugging information on STDOUT, killing this CGI. So here we redirect
# STDOUT until we need it.
print('Content-Type: application/json\n\n', flush=True)
original_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')
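
Taken out of the CGI, the pattern described in the comment above looks roughly like this. A condensed sketch, assuming the only goal is to keep chatty imports from corrupting the CGI response:

# Sketch: emit the CGI header first, mute stdout while heavy libraries import,
# then restore it when the JSON payload is ready.
import json
import os
import sys

print('Content-Type: application/json\n\n', flush=True)   # header reaches the client first
original_stdout = sys.stdout
sys.stdout = open(os.devnull, 'w')                         # swallow library chatter

import scanpy as sc                                        # noisy dependency stack imports here

sys.stdout = original_stdout                               # restore before printing the response
print(json.dumps({'success': 1}))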

@@ -40,7 +42,7 @@ session_id = None
user_upload_file_base = '../uploads/files'

status = {
"process_id": os.getpid(),
"process_id": None,
"status": "extracting",
"message": "",
"progress": 0
@@ -58,50 +60,72 @@ def main():
user = geardb.get_user_from_session_id(session_id)
if user is None:
result['message'] = 'User ID not found. Please log in to continue.'
print_and_go(json.dumps(result))
print_and_go(None, json.dumps(result))

# values are mex_3tab, excel, rdata, h5ad
dataset_formats = ['mex_3tab', 'excel', 'rdata', 'h5ad']
dataset_format = form.getvalue('dataset_format')
dataset_upload_dir = os.path.join(user_upload_file_base, session_id, share_uid)

# quickly write the status so the page doesn't error out
status_file = os.path.join(dataset_upload_dir, 'status.json')
with open(status_file, 'w') as f:
f.write(json.dumps(status))

# if the upload directory doesn't exist, we can't process the dataset
if not os.path.exists(dataset_upload_dir):
result['message'] = 'Dataset/directory not found.'
print_and_go(json.dumps(result))
print_and_go(status_file, json.dumps(result))

if dataset_format not in dataset_formats:
result['message'] = 'Unsupported dataset format.'
print_and_go(json.dumps(result))
print_and_go(status_file, json.dumps(result))

# Since this process can take a while, we want to fork off of apache and continue
# processing in the background. We'll write the status to a file in the same
# directory as the dataset.
status_file = os.path.join(dataset_upload_dir, 'status.json')
# processing in the background.
with open(status_file, 'w') as f:
f.write(json.dumps(status))

###############################################
# This is the fork off of apache
# https://stackoverflow.com/a/22181041/1368079
#sys.stdout.flush()
#os.close(sys.stdout.fileno()) # Break web pipe
#sys.stderr.flush()
#os.close(sys.stderr.fileno()) # Break web pipe
#if os.fork(): # Get out parent process
# result['success'] = 1
# print_and_go(json.dumps(result))
###############################################
# https://stackoverflow.com/questions/6024472/start-background-process-daemon-from-cgi-script
sys.stdout = original_stdout
result['success'] = 1
print(json.dumps(result))

sys.stdout.flush()
os.close(sys.stdout.fileno()) # Break web pipe
sys.stderr.flush()
os.close(sys.stderr.fileno()) # Break web pipe

if os.fork(): # Get out of parent process
sys.exit(0)

# open a log file in /tmp
#f_out = open('/tmp/apache.stdout.log', 'w')
#f_err = open('/tmp/apache.stderr.log', 'w')

time.sleep(1) # Make sure the parent process has reached its exit before continuing.
os.setsid() # Become process group leader

status['process_id'] = os.getpid()
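
The detach sequence above follows the linked Stack Overflow approach: flush and close the web pipes so Apache can finish the request, fork so the parent exits, then become a session leader in the child and carry on. A condensed, standalone sketch of that sequence (the work function is a placeholder, not code from this commit):

# Sketch: detaching a long-running job from the Apache CGI request.
import os
import sys
import time

def detach_and_run(work):
    sys.stdout.flush()
    os.close(sys.stdout.fileno())   # break the web pipe so the response is complete
    sys.stderr.flush()
    os.close(sys.stderr.fileno())

    if os.fork():                   # parent returns control to Apache
        sys.exit(0)

    time.sleep(1)                   # give the parent time to exit
    os.setsid()                     # child becomes process-group leader
    work()                          # long-running processing continues here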

# new child command
if dataset_format == 'mex_3tab':
process_mex_3tab(dataset_upload_dir)
process_mex_3tab(dataset_upload_dir)



print_and_go(json.dumps(result))

def print_and_go(content):
def print_and_go(status_file, content):
sys.stdout = original_stdout
print('Content-Type: application/json\n\n', flush=True)
print(content)

if status_file is not None:
with open(status_file, 'w') as f:
f.write(json.dumps(status))

sys.exit(0)

def process_3tab(upload_dir):
@@ -143,12 +167,8 @@ def process_3tab(upload_dir):
obs[num_type] = pd.to_numeric(obs[num_type])

# Read in expressions as AnnData object in a memory-efficient manner
#print("Creating AnnData object with obs and var", file=sys.stderr, flush=True)
adata = sc.AnnData(obs=var, var=obs)
#print("Reading expression matrix file: {0}".format(expression_matrix_path), file=sys.stderr, flush=True)

reader = pd.read_csv(expression_matrix_path, sep='\t', index_col=0, chunksize=chunk_size)
#adata.X = sparse.vstack([sparse.csr_matrix(chunk.values) for chunk in reader])

# This can be an order of magnitude faster than the using python alone
total_rows = int(subprocess.check_output(f"/usr/bin/wc -l {expression_matrix_path}", shell=True).split()[0])
@@ -159,7 +179,7 @@ def process_3tab(upload_dir):
rows_read += chunk_size
percentage = int((rows_read / total_rows) * 100)
expression_matrix.append(sparse.csr_matrix(chunk.values))
print(f"Chunks read: {rows_read}/{total_rows}", file=sys.stderr, flush=True)

status['progress'] = percentage
status['message'] = f"Processed {rows_read}/{total_rows} expression matrix chunks ..."
with open(os.path.join(upload_dir, 'status.json'), 'w') as f:
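
The loop in this hunk streams the matrix through pandas in chunks, converts each chunk to a sparse CSR block, and records progress as it goes; the wc -l call gets the row count up front without loading the file. A self-contained sketch of the same pattern, with placeholder path and chunk size:

# Sketch: chunked load of a large tab-delimited matrix into one sparse matrix,
# reporting progress from a row count obtained via wc -l.
import subprocess
import pandas as pd
from scipy import sparse

def load_sparse_matrix(path, chunk_size=500):
    total_rows = int(subprocess.check_output(['/usr/bin/wc', '-l', path]).split()[0])
    blocks, rows_read = [], 0
    for chunk in pd.read_csv(path, sep='\t', index_col=0, chunksize=chunk_size):
        blocks.append(sparse.csr_matrix(chunk.values))
        rows_read += len(chunk)
        print(f"Processed {min(rows_read, total_rows)}/{total_rows} rows", flush=True)
    return sparse.vstack(blocks)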
@@ -192,7 +212,22 @@ def process_mex_3tab(upload_dir):
with tarfile.open(filename) as tf:
for entry in tf:
tf.extract(entry, path=upload_dir)
files_extracted.append(entry.name)

# Nemo suffixes
nemo_suffixes = ['DataMTX.tab', 'COLmeta.tab', 'ROWmeta.tab']
suffix_found = None

for suffix in nemo_suffixes:
if entry.name.endswith(suffix):
suffix_found = suffix
# Rename the file to the appropriate name
os.rename(os.path.join(upload_dir, entry.name),
os.path.join(upload_dir, suffix))

if suffix_found is not None:
files_extracted.append(suffix_found)
else:
files_extracted.append(entry.name)
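
The block added here normalizes NeMO-style member names: any extracted file whose name ends with one of the three suffixes is renamed to just that suffix, so the later 3tab detection sees the canonical filenames. The same idea in isolation (directory and member name are hypothetical):

# Sketch: rename a NeMO-suffixed tar member to its canonical 3tab filename.
import os

NEMO_SUFFIXES = ('DataMTX.tab', 'COLmeta.tab', 'ROWmeta.tab')

def normalize_member(upload_dir, member_name):
    for suffix in NEMO_SUFFIXES:
        if member_name.endswith(suffix):
            os.rename(os.path.join(upload_dir, member_name),
                      os.path.join(upload_dir, suffix))
            return suffix
    return member_name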

# Determine the dataset type
dataset_type = tarball_content_type(files_extracted)
@@ -233,7 +268,7 @@ def tarball_content_type(filenames):

if 'matrix.mtx' in filenames and 'barcodes.tsv' in filenames and 'genes.tsv' in filenames:
return 'mex'

if 'DataMTX.tab' in filenames and 'COLmeta.tab' in filenames and 'ROWmeta.tab' in filenames:
return 'threetab'

11 changes: 11 additions & 0 deletions www/cgi/store_expression_dataset.cgi
@@ -36,6 +36,7 @@ def main():
# This should already have been created when the metadata was stored
user_upload_file_base = "../uploads/files/{0}".format(session_id)
dataset_filename = os.path.join(user_upload_file_base, share_uid, share_uid + '.' + file_extension)
status_file = os.path.join(user_upload_file_base, share_uid, 'status.json')

if not user:
result['message'] = 'Only logged in users can upload datasets.'
@@ -61,6 +62,16 @@
result['success'] = 1
result['message'] = 'Dataset file saved successfully.'

status = {
"process_id": None,
"status": "uploaded",
"message": "The dataset has been uploaded and is pending processing",
"progress": 0
}

with open(status_file, 'w') as f:
f.write(json.dumps(status))

except Exception as e:
result['message'] = 'Error saving dataset file: ' + str(e)

6 changes: 0 additions & 6 deletions www/js/common.v2.js
@@ -1313,12 +1313,6 @@ const apiCallsMixin = {
return data;
},

async processDatasetUpload(formData) {
formData.append("session_id", this.sessionId);
const {data} = await axios.post("/cgi/process_uploaded_expression_dataset.cgi", formData);
return data;
},

/**
* Renames a dataset collection.
*
15 changes: 14 additions & 1 deletion www/js/upload_dataset.js
@@ -465,7 +465,20 @@ const processDataset = async () => {
const formData = new FormData();
formData.append('share_uid', share_uid);
formData.append('dataset_format', dataset_format);
const data = await apiCallsMixin.processDatasetUpload(formData);
formData.append('session_id', CURRENT_USER.session_id);

const xhr = new XMLHttpRequest();
xhr.open('POST', './cgi/process_uploaded_expression_dataset.cgi', true);

xhr.onload = function() {
const response = JSON.parse(xhr.responseText);

if (response.success) {
// Nothing to do here since status checking happens elsewhere
}
}

xhr.send(formData);
}

const validateMetadataForm = () => {
4 changes: 3 additions & 1 deletion www/upload_dataset.html
@@ -768,7 +768,9 @@ <h3>H5AD / Python</h3>
<div id="step-process-dataset-c" class="step-c is-hidden">
<h1>Step - Process dataset</h1>
<p>
Your dataset is being processed on the server. This may take a few minutes, depending on the size of the dataset.
Your dataset is being processed on the server. This may take a few minutes, depending on the
size of the dataset. You can close your browser at any time and return to the uploader to
check on the progress.
</p>
<hr />
<div class="columns mb-0 pb-0">
