Merge pull request #6 from TheJacksonLaboratory/G3-204-get-search-wor…

…king G3-204: Get Legacy Search Working
TheJacksonLaboratory · Jun 18, 2024 · cdef14c · cdef14c
2 parents cd89146 + 9e88dca
commit cdef14c
Show file tree

Hide file tree

Showing 7 changed files with 428 additions and 2 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,6 @@
+deploy/
+.github/
+docs/
+skaffold.yaml
+.gitignore
+.git
diff --git a/Dockerfile b/Dockerfile
@@ -24,6 +24,8 @@ ENV PATH="${POETRY_HOME}/bin:${PATH}"
 
 WORKDIR /app
 
+COPY sample-configs/k8s/sphinx /app/sphinx/
+
 COPY pyproject.toml poetry.lock README.md /app/
 
 RUN poetry install --sync --no-root

diff --git a/deploy/k8s/base/deployment.yaml b/deploy/k8s/base/deployment.yaml
@@ -33,4 +33,11 @@ spec:
             - secretRef:
                 name: geneweaver-legacy-secrets
           ports:
-            - containerPort: 8000
+            - containerPort: 8000
+        - name: geneweaver-legacy-search
+          image: geneweaver-legacy
+          imagePullPolicy: Always
+          envFrom:
+            - secretRef:
+                name: geneweaver-db
+          command: ["/bin/bash", "-c", "/app/sphinx/start_sphinx.sh"]
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "geneweaver-legacy"
-version = "1.1.1"
+version = "1.2.0"
 description = ""
 authors = ["Alexander Berger <[email protected]>"]
 readme = "README.md"

diff --git a/sample-configs/k8s/sphinx/sphinx.conf b/sample-configs/k8s/sphinx/sphinx.conf
@@ -0,0 +1,291 @@
+
+source geneset_src : base
+{
+  # potentially delete old entry, first
+  sql_query_pre = SET search_path to production,extsrc,odestatic;
+  sql_query_pre = DELETE FROM sphinxcounters WHERE index_name='geneset_tmp';
+  sql_query_pre = INSERT INTO sphinxcounters VALUES ('geneset_tmp', NOW());
+
+  sql_query_range = \
+    SELECT min(gs.gs_id), max(gs.gs_id) \
+    FROM geneset gs \
+    WHERE gs.gs_status<>'deleted' \
+    AND gs.gs_updated < ( \
+      SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp');
+  sql_range_step = 1000
+
+  sql_query = \
+    SELECT gs.gs_id, gs.gs_id, gs.gs_id as gs_id_attr, 'GS'||gs.gs_id AS gsid_prefixed, \
+      gs.gs_name as name, gs.gs_description as description, \
+      gs.gs_abbreviation as label, gs.usr_id, gs.gs_count, \
+      p.pub_pubmed AS pubmed_id, p.pub_authors, p.pub_title, p.pub_abstract, p.pub_journal, \
+      sp.sp_name as species, sp.sp_taxid as taxid, \
+      COALESCE(gs.cur_id, 0) AS cur_id, COALESCE(gs.sp_id, 0) AS sp_id, \
+      COALESCE(gs.gs_attribution, 0) as attribution, \
+      CASE \
+        WHEN gs.gs_status='provisional' THEN 1 \
+        WHEN gs.gs_status LIKE 'deprecated%' THEN 2 \
+        ELSE 0 \
+      END AS gs_status, \
+      CASE \
+        WHEN gs.sp_id=1 THEN 'mouse' \
+        WHEN gs.sp_id=2 THEN 'human' \
+        WHEN gs.sp_id=3 THEN 'rat' \
+        WHEN gs.sp_id=4 THEN 'zebrafish' \
+        WHEN gs.sp_id=5 THEN 'fly' \
+        WHEN gs.sp_id=6 THEN 'monkey' \
+        WHEN gs.sp_id=8 THEN 'c. elegans' \
+        WHEN gs.sp_id=9 THEN 'yeast' \
+      END AS common_name \
+    FROM geneset gs \
+      LEFT OUTER JOIN publication p USING(pub_id) \
+      LEFT OUTER JOIN species sp USING(sp_id) \
+    WHERE gs.gs_status<>'deleted' \
+      AND gs.gs_id>=$start AND gs.gs_id<=$end \
+      AND gs.gs_updated < ( \
+        SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp');
+
+  # gene ref ids, gene names, and type
+  # ode_ref_id||' '||COALESCE(gi_name, '')||' '||COALESCE(gi_type, '') \
+
+  # just gene ref ids
+  sql_joined_field = genes from query; \
+    SELECT gs.gs_id, ode_ref_id \
+    FROM geneset gs \
+      JOIN geneset_value USING(gs_id) \
+      JOIN gene USING(ode_gene_id) \
+      JOIN gene_info USING(ode_gene_id) \
+    WHERE gs.gs_status<>'deleted' AND gsv_in_threshold AND ode_pref \
+      AND gs.gs_updated < ( \
+        SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp') \
+    ORDER BY gs.gs_id ASC;
+
+  # add in ontology term associations
+  sql_joined_field = ontologies from query; \
+    SELECT gso.gs_id, o.ont_ref_id||' '||o.ont_name||' '||o.ont_description \
+    FROM ontology o, geneset_ontology gso, geneset gs \
+    WHERE gso.gso_ref_type!='Blacklist' \
+      AND gso.ont_id=o.ont_id \
+      AND gso.gs_id=gs.gs_id AND gs.gs_status<>'deleted' \
+      AND gs.gs_updated < ( \
+        SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp') \
+    ORDER BY gs_id ASC;
+
+  # both indexed and attributed
+  sql_field_string = common_name
+  sql_field_string = species
+  sql_field_string = taxid
+  sql_field_string = pubmed_id
+
+  sql_attr_uint = sp_id
+  sql_attr_uint = usr_id
+  sql_attr_uint = cur_id 
+  sql_attr_uint = attribution
+  sql_attr_uint = gs_status
+  sql_attr_uint = gs_count
+  sql_attr_uint = gs_id_attr
+
+  sql_attr_multi = uint grp_id from query; \
+    SELECT gs_id, regexp_split_to_table(gs_groups, ',')::integer AS grp_id \
+    FROM production.geneset \
+    WHERE gs_status<>'deleted' AND (gs_groups <> '') IS NOT FALSE AND gs_updated < ( \
+      SELECT last_update FROM production.sphinxcounters WHERE index_name='geneset_tmp') \
+    ORDER BY gs_id ASC;
+
+  sql_query_post_index = SET search_path to production,extsrc,odestatic;
+  sql_query_post_index = DELETE FROM sphinxcounters WHERE index_name='geneset';
+  sql_query_post_index = UPDATE sphinxcounters \
+    SET index_name='geneset' WHERE index_name='geneset_tmp';
+  sql_query_post_index = DELETE FROM genesetklist WHERE ts < ( \
+    SELECT last_update FROM sphinxcounters WHERE index_name='geneset');
+}
+
+source geneset_delta_src : base
+{
+  sql_query_pre = SET search_path to production,extsrc,odestatic;
+
+  sql_query = \
+    SELECT gs.gs_id, gs.gs_id, gs.gs_id as gs_id_attr, 'GS'||gs.gs_id AS gsid_prefixed, \
+      gs.gs_name as name, gs.gs_description as description, \
+      gs.gs_abbreviation as label, gs.usr_id, gs.gs_count, \
+      p.pub_pubmed AS pubmed_id, p.pub_authors, p.pub_title, p.pub_abstract, p.pub_journal, \
+      sp.sp_name as species, sp.sp_taxid as taxid, \
+      COALESCE(gs.cur_id, 0) AS cur_id, COALESCE(gs.sp_id, 0) AS sp_id, \
+      COALESCE(gs.gs_attribution, 0) as attribution, \
+      CASE \
+        WHEN gs.gs_status='provisional' THEN 1 \
+        WHEN gs.gs_status LIKE 'deprecated%' THEN 2 \
+        ELSE 0 \
+      END AS gs_status, \
+      CASE \
+        WHEN gs.sp_id=1 THEN 'mouse' \
+        WHEN gs.sp_id=2 THEN 'human' \
+        WHEN gs.sp_id=3 THEN 'rat' \
+        WHEN gs.sp_id=4 THEN 'zebrafish' \
+        WHEN gs.sp_id=5 THEN 'fly' \
+        WHEN gs.sp_id=6 THEN 'monkey' \
+        WHEN gs.sp_id=8 THEN 'c. elegans' \
+        WHEN gs.sp_id=9 THEN 'yeast' \
+      END AS common_name \
+    FROM geneset gs \
+      LEFT OUTER JOIN publication p USING(pub_id) \
+      LEFT OUTER JOIN species sp USING(sp_id) \
+    WHERE gs.gs_status<>'deleted' \
+      AND gs.gs_updated >= ( \
+        SELECT last_update FROM sphinxcounters WHERE index_name='geneset');
+
+  # gene ref ids, gene names, and type
+  # ode_ref_id||' '||COALESCE(gi_name, '')||' '||COALESCE(gi_type, '') \
+
+  # just gene ref ids
+  sql_joined_field = genes from query; \
+    SELECT gs.gs_id, ode_ref_id \
+    FROM geneset gs \
+      JOIN geneset_value USING(gs_id) \
+      JOIN gene USING(ode_gene_id) \
+      JOIN gene_info USING(ode_gene_id) \
+    WHERE gs.gs_status<>'deleted' AND gsv_in_threshold AND ode_pref \
+      AND gs.gs_updated >= ( \
+        SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \
+    ORDER BY gs.gs_id ASC;
+
+  # add in ontology term associations
+  sql_joined_field = ontologies from query; \
+    SELECT gso.gs_id, o.ont_ref_id||' '||o.ont_name||' '||o.ont_description \
+    FROM ontology o, geneset_ontology gso, geneset gs \
+    WHERE gso.gso_ref_type!='Blacklist' \
+      AND gso.ont_id=o.ont_id \
+      AND gso.gs_id=gs.gs_id AND gs.gs_status<>'deleted' \
+      AND gs.gs_updated >= ( \
+        SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \
+    ORDER BY gs_id ASC;
+
+  # both indexed and attributed
+  sql_field_string = common_name
+  sql_field_string = species
+  sql_field_string = taxid
+  sql_field_string = pubmed_id
+
+  sql_attr_uint = sp_id
+  sql_attr_uint = usr_id
+  sql_attr_uint = cur_id 
+  sql_attr_uint = attribution
+  sql_attr_uint = gs_status
+  sql_attr_uint = gs_count
+  sql_attr_uint = gs_id_attr
+
+  sql_attr_multi = uint grp_id from query; \
+    SELECT gs_id, regexp_split_to_table(gs_groups, ',')::integer AS grp_id \
+    FROM production.geneset \
+    WHERE gs_status<>'deleted' AND gs_updated >= ( \
+      SELECT last_update FROM production.sphinxcounters WHERE index_name='geneset') \
+    ORDER BY gs_id ASC;
+
+  sql_query_killlist = \
+    SELECT gs_id FROM geneset WHERE gs_updated >= ( \
+      SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \
+    UNION SELECT gs_id from genesetklist;
+}
+
+#############################################################################
+## index definitions
+#############################################################################
+
+index geneset
+{
+  source = geneset_src
+  path = /app/sphinx/geneset_idx
+
+  morphology = stem_en
+
+  # wordforms file, in "mapfrom > mapto" plain text format
+  # optional, default is empty
+  #
+  # wordforms      = ../var/data/wordforms.txt
+
+  # minimum indexed word length; default is 1 (index everything)
+  min_word_len = 1
+
+  html_strip = 1
+  index_exact_words = 1
+
+  stopwords = /app/sphinx/stopwords.txt
+
+  min_prefix_len = 1
+  enable_star = 1 # allow for partial results
+}
+
+index geneset_delta
+{
+  source = geneset_delta_src
+  path = /app/sphinx/geneset_delta_idx
+
+  morphology = stem_en
+
+  # wordforms file, in "mapfrom > mapto" plain text format
+  # optional, default is empty
+  #
+  # wordforms      = ../var/data/wordforms.txt
+
+  # minimum indexed word length; default is 1 (index everything)
+  min_word_len = 1
+
+  html_strip = 1
+  index_exact_words = 1
+
+  stopwords = /app/sphinx/stopwords.txt
+
+  min_prefix_len = 1
+  enable_star = 1 # allow for partial results
+}
+
+#############################################################################
+## indexer settings
+#############################################################################
+
+indexer
+{
+  mem_limit = 2048M
+}
+
+#############################################################################
+## searchd settings
+#############################################################################
+
+searchd
+{
+  # hostname, port, or hostname:port, or /unix/socket/path to listen on
+  # multi-value, multiple listen points are allowed
+  # optional, default is 0.0.0.0:9312 (listen on all interfaces, port 9312)
+  #
+  # listen        = 127.0.0.1
+  # listen        = 192.168.0.1:9312
+  # listen        = 9312
+  # listen        = /var/run/searchd.sock
+
+
+  # log file, searchd run info is logged here
+  # optional, default is 'searchd.log'
+  log          = /app/sphinx/sphinx-searchd.log
+
+  # query log file, all search queries are logged here
+  # optional, default is empty (do not log queries)
+  query_log      = /app/sphinx/sphinx-query.log
+
+  # maximum amount of children to fork (concurrent searches to run)
+  # optional, default is 0 (unlimited)
+  max_children    = 30
+
+  # PID file, searchd process ID file name
+  # mandatory
+  pid_file      = /app/sphinx/sphinx-searchd.pid
+
+  # max amount of matches the daemon ever keeps in RAM, per-index
+  # WARNING, THERE'S ALSO PER-QUERY LIMIT, SEE SetLimits() API CALL
+  # default is 1000 (just like Google)
+  max_matches      = 1000
+  max_packet_size  = 64M
+
+  # avoid deprecation warning
+  #compat_sphinxql_magics = 0
+}
diff --git a/sample-configs/k8s/sphinx/start_sphinx.sh b/sample-configs/k8s/sphinx/start_sphinx.sh
@@ -0,0 +1,17 @@
+{
+  echo "source base"
+  echo "{"
+  echo "  type = pgsql"
+  echo "  sql_host = $DB_HOST"
+  echo "  sql_user = $DB_USERNAME"
+  echo "  sql_pass = $DB_PASSWORD"
+  echo "  sql_db = $DB_NAME"
+  echo "}"
+  cat /app/sphinx/sphinx.conf
+} > /app/sphinx/sphinx.conf.new
+
+mv /app/sphinx/sphinx.conf.new /app/sphinx/sphinx.conf
+
+indexer --all --config /app/sphinx/sphinx.conf
+
+searchd --nodetach --config /app/sphinx/sphinx.conf