Skip to content

Commit

Permalink
Merge pull request #6 from TheJacksonLaboratory/G3-204-get-search-wor…
Browse files Browse the repository at this point in the history
…king

G3-204: Get Legacy Search Working
  • Loading branch information
bergsalex committed Jun 18, 2024
2 parents cd89146 + 9e88dca commit cdef14c
Show file tree
Hide file tree
Showing 7 changed files with 428 additions and 2 deletions.
6 changes: 6 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
deploy/
.github/
docs/
skaffold.yaml
.gitignore
.git
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ ENV PATH="${POETRY_HOME}/bin:${PATH}"

WORKDIR /app

COPY sample-configs/k8s/sphinx /app/sphinx/

COPY pyproject.toml poetry.lock README.md /app/

RUN poetry install --sync --no-root
Expand Down
9 changes: 8 additions & 1 deletion deploy/k8s/base/deployment.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,11 @@ spec:
- secretRef:
name: geneweaver-legacy-secrets
ports:
- containerPort: 8000
- containerPort: 8000
- name: geneweaver-legacy-search
image: geneweaver-legacy
imagePullPolicy: Always
envFrom:
- secretRef:
name: geneweaver-db
command: ["/bin/bash", "-c", "/app/sphinx/start_sphinx.sh"]
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "geneweaver-legacy"
version = "1.1.1"
version = "1.2.0"
description = ""
authors = ["Alexander Berger <[email protected]>"]
readme = "README.md"
Expand Down
291 changes: 291 additions & 0 deletions sample-configs/k8s/sphinx/sphinx.conf
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@

# Main ("full") geneset source.
# Inherits from the `base` source, which is not defined in this file
# (start_sphinx.sh prepends it, carrying the PostgreSQL connection settings).
# Every query below is limited to non-deleted genesets whose gs_updated is
# older than the 'geneset_tmp' timestamp recorded by the pre-queries.
source geneset_src : base
{
# Record the start of this indexing run under 'geneset_tmp',
# first removing any row left behind by an earlier run.
sql_query_pre = SET search_path to production,extsrc,odestatic;
sql_query_pre = DELETE FROM sphinxcounters WHERE index_name='geneset_tmp';
sql_query_pre = INSERT INTO sphinxcounters VALUES ('geneset_tmp', NOW());

# Ranged fetching: the main query runs once per window of 1000 gs_id values
# between the min and max ids returned here ($start/$end in sql_query).
sql_query_range = \
SELECT min(gs.gs_id), max(gs.gs_id) \
FROM geneset gs \
WHERE gs.gs_status<>'deleted' \
AND gs.gs_updated < ( \
SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp');
sql_range_step = 1000

# Main document query. gs_id is selected three times: as the first column
# (the document id), again under its own name, and as gs_id_attr, which is
# declared as a uint attribute further down.
# gs_status is mapped to a number: 1 = provisional, 2 = deprecated*, 0 = anything else.
# common_name is derived from sp_id; ids not listed in the CASE (e.g. 7) yield NULL.
sql_query = \
SELECT gs.gs_id, gs.gs_id, gs.gs_id as gs_id_attr, 'GS'||gs.gs_id AS gsid_prefixed, \
gs.gs_name as name, gs.gs_description as description, \
gs.gs_abbreviation as label, gs.usr_id, gs.gs_count, \
p.pub_pubmed AS pubmed_id, p.pub_authors, p.pub_title, p.pub_abstract, p.pub_journal, \
sp.sp_name as species, sp.sp_taxid as taxid, \
COALESCE(gs.cur_id, 0) AS cur_id, COALESCE(gs.sp_id, 0) AS sp_id, \
COALESCE(gs.gs_attribution, 0) as attribution, \
CASE \
WHEN gs.gs_status='provisional' THEN 1 \
WHEN gs.gs_status LIKE 'deprecated%' THEN 2 \
ELSE 0 \
END AS gs_status, \
CASE \
WHEN gs.sp_id=1 THEN 'mouse' \
WHEN gs.sp_id=2 THEN 'human' \
WHEN gs.sp_id=3 THEN 'rat' \
WHEN gs.sp_id=4 THEN 'zebrafish' \
WHEN gs.sp_id=5 THEN 'fly' \
WHEN gs.sp_id=6 THEN 'monkey' \
WHEN gs.sp_id=8 THEN 'c. elegans' \
WHEN gs.sp_id=9 THEN 'yeast' \
END AS common_name \
FROM geneset gs \
LEFT OUTER JOIN publication p USING(pub_id) \
LEFT OUTER JOIN species sp USING(sp_id) \
WHERE gs.gs_status<>'deleted' \
AND gs.gs_id>=$start AND gs.gs_id<=$end \
AND gs.gs_updated < ( \
SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp');

# Disabled alternative for the `genes` field below: gene ref ids together
# with gene names and types.
# gene ref ids, gene names, and type
# ode_ref_id||' '||COALESCE(gi_name, '')||' '||COALESCE(gi_type, '') \

# just gene ref ids
# Joined full-text field `genes`: one ode_ref_id per row, keyed by gs_id,
# restricted to rows where gsv_in_threshold and ode_pref are true.
sql_joined_field = genes from query; \
SELECT gs.gs_id, ode_ref_id \
FROM geneset gs \
JOIN geneset_value USING(gs_id) \
JOIN gene USING(ode_gene_id) \
JOIN gene_info USING(ode_gene_id) \
WHERE gs.gs_status<>'deleted' AND gsv_in_threshold AND ode_pref \
AND gs.gs_updated < ( \
SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp') \
ORDER BY gs.gs_id ASC;

# add in ontology term associations
# Joined full-text field `ontologies`: ref id, name and description of every
# associated ontology term, skipping associations whose gso_ref_type is 'Blacklist'.
sql_joined_field = ontologies from query; \
SELECT gso.gs_id, o.ont_ref_id||' '||o.ont_name||' '||o.ont_description \
FROM ontology o, geneset_ontology gso, geneset gs \
WHERE gso.gso_ref_type!='Blacklist' \
AND gso.ont_id=o.ont_id \
AND gso.gs_id=gs.gs_id AND gs.gs_status<>'deleted' \
AND gs.gs_updated < ( \
SELECT last_update FROM sphinxcounters WHERE index_name='geneset_tmp') \
ORDER BY gs_id ASC;

# both indexed and attributed
sql_field_string = common_name
sql_field_string = species
sql_field_string = taxid
sql_field_string = pubmed_id

# Unsigned integer attributes taken from the main query's columns.
sql_attr_uint = sp_id
sql_attr_uint = usr_id
sql_attr_uint = cur_id
sql_attr_uint = attribution
sql_attr_uint = gs_status
sql_attr_uint = gs_count
sql_attr_uint = gs_id_attr

# Multi-valued attribute grp_id: gs_groups is a comma-separated list that is
# split into one integer per row. `(gs_groups <> '') IS NOT FALSE` keeps NULL
# and non-empty values but drops empty strings.
# NOTE(review): presumably because ''::integer would raise an error - confirm.
# Tables are schema-qualified here rather than relying on search_path.
sql_attr_multi = uint grp_id from query; \
SELECT gs_id, regexp_split_to_table(gs_groups, ',')::integer AS grp_id \
FROM production.geneset \
WHERE gs_status<>'deleted' AND (gs_groups <> '') IS NOT FALSE AND gs_updated < ( \
SELECT last_update FROM production.sphinxcounters WHERE index_name='geneset_tmp') \
ORDER BY gs_id ASC;

# After indexing: promote the 'geneset_tmp' counter row to 'geneset' (replacing
# the previous 'geneset' row), then purge genesetklist rows whose ts is older
# than that promoted timestamp.
sql_query_post_index = SET search_path to production,extsrc,odestatic;
sql_query_post_index = DELETE FROM sphinxcounters WHERE index_name='geneset';
sql_query_post_index = UPDATE sphinxcounters \
SET index_name='geneset' WHERE index_name='geneset_tmp';
sql_query_post_index = DELETE FROM genesetklist WHERE ts < ( \
SELECT last_update FROM sphinxcounters WHERE index_name='geneset');
}

# Delta geneset source.
# Inherits from the `base` source, which is not defined in this file
# (start_sphinx.sh prepends it, carrying the PostgreSQL connection settings).
# Every query below is limited to non-deleted genesets whose gs_updated is at
# or after the 'geneset' timestamp in sphinxcounters, i.e. rows changed since
# the main source (geneset_src) last completed.
source geneset_delta_src : base
{
    sql_query_pre = SET search_path to production,extsrc,odestatic;

    # Main document query; same column list as geneset_src so both indexes
    # share one schema. gs_id is selected as the document id (first column),
    # under its own name, and as the gs_id_attr attribute.
    # gs_status: 1 = provisional, 2 = deprecated*, 0 = anything else.
    # common_name: sp_id values not listed in the CASE yield NULL.
    sql_query = \
        SELECT gs.gs_id, gs.gs_id, gs.gs_id as gs_id_attr, 'GS'||gs.gs_id AS gsid_prefixed, \
            gs.gs_name as name, gs.gs_description as description, \
            gs.gs_abbreviation as label, gs.usr_id, gs.gs_count, \
            p.pub_pubmed AS pubmed_id, p.pub_authors, p.pub_title, p.pub_abstract, p.pub_journal, \
            sp.sp_name as species, sp.sp_taxid as taxid, \
            COALESCE(gs.cur_id, 0) AS cur_id, COALESCE(gs.sp_id, 0) AS sp_id, \
            COALESCE(gs.gs_attribution, 0) as attribution, \
            CASE \
                WHEN gs.gs_status='provisional' THEN 1 \
                WHEN gs.gs_status LIKE 'deprecated%' THEN 2 \
                ELSE 0 \
            END AS gs_status, \
            CASE \
                WHEN gs.sp_id=1 THEN 'mouse' \
                WHEN gs.sp_id=2 THEN 'human' \
                WHEN gs.sp_id=3 THEN 'rat' \
                WHEN gs.sp_id=4 THEN 'zebrafish' \
                WHEN gs.sp_id=5 THEN 'fly' \
                WHEN gs.sp_id=6 THEN 'monkey' \
                WHEN gs.sp_id=8 THEN 'c. elegans' \
                WHEN gs.sp_id=9 THEN 'yeast' \
            END AS common_name \
        FROM geneset gs \
        LEFT OUTER JOIN publication p USING(pub_id) \
        LEFT OUTER JOIN species sp USING(sp_id) \
        WHERE gs.gs_status<>'deleted' \
            AND gs.gs_updated >= ( \
                SELECT last_update FROM sphinxcounters WHERE index_name='geneset');

    # Disabled alternative for the `genes` field (ref ids plus names and types):
    #   ode_ref_id||' '||COALESCE(gi_name, '')||' '||COALESCE(gi_type, '')

    # Joined full-text field `genes`: just the gene ref ids, one per row,
    # restricted to rows where gsv_in_threshold and ode_pref are true.
    sql_joined_field = genes from query; \
        SELECT gs.gs_id, ode_ref_id \
        FROM geneset gs \
        JOIN geneset_value USING(gs_id) \
        JOIN gene USING(ode_gene_id) \
        JOIN gene_info USING(ode_gene_id) \
        WHERE gs.gs_status<>'deleted' AND gsv_in_threshold AND ode_pref \
            AND gs.gs_updated >= ( \
                SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \
        ORDER BY gs.gs_id ASC;

    # Joined full-text field `ontologies`: ref id, name and description of
    # every associated ontology term, skipping 'Blacklist' associations.
    sql_joined_field = ontologies from query; \
        SELECT gso.gs_id, o.ont_ref_id||' '||o.ont_name||' '||o.ont_description \
        FROM ontology o, geneset_ontology gso, geneset gs \
        WHERE gso.gso_ref_type!='Blacklist' \
            AND gso.ont_id=o.ont_id \
            AND gso.gs_id=gs.gs_id AND gs.gs_status<>'deleted' \
            AND gs.gs_updated >= ( \
                SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \
        ORDER BY gs_id ASC;

    # both indexed and attributed
    sql_field_string = common_name
    sql_field_string = species
    sql_field_string = taxid
    sql_field_string = pubmed_id

    # Unsigned integer attributes taken from the main query's columns.
    sql_attr_uint = sp_id
    sql_attr_uint = usr_id
    sql_attr_uint = cur_id
    sql_attr_uint = attribution
    sql_attr_uint = gs_status
    sql_attr_uint = gs_count
    sql_attr_uint = gs_id_attr

    # Multi-valued attribute grp_id: gs_groups is a comma-separated list that
    # is split into one integer per row.
    # `(gs_groups <> '') IS NOT FALSE` keeps NULL and non-empty values but drops
    # empty strings: regexp_split_to_table('', ',') yields a single '' row and
    # ''::integer raises an error, which would abort this query. The same guard
    # is used by geneset_src.
    sql_attr_multi = uint grp_id from query; \
        SELECT gs_id, regexp_split_to_table(gs_groups, ',')::integer AS grp_id \
        FROM production.geneset \
        WHERE gs_status<>'deleted' AND (gs_groups <> '') IS NOT FALSE AND gs_updated >= ( \
            SELECT last_update FROM production.sphinxcounters WHERE index_name='geneset') \
        ORDER BY gs_id ASC;

    # Kill-list: ids of every geneset touched since the last main run
    # (including deleted ones, since there is no status filter here) plus
    # every id recorded in genesetklist.
    sql_query_killlist = \
        SELECT gs_id FROM geneset WHERE gs_updated >= ( \
            SELECT last_update FROM sphinxcounters WHERE index_name='geneset') \
        UNION SELECT gs_id from genesetklist;
}

#############################################################################
## index definitions
#############################################################################

# Main geneset index, built from geneset_src.
index geneset
{
    # --- data source and on-disk location ---
    source = geneset_src
    path = /app/sphinx/geneset_idx

    # --- text processing ---
    # English stemming; exact (unstemmed) word forms are indexed as well.
    morphology = stem_en
    index_exact_words = 1

    # Strip HTML markup from incoming text before indexing.
    html_strip = 1

    # Words listed in this file are not indexed.
    stopwords = /app/sphinx/stopwords.txt

    # Optional wordforms file ("mapfrom > mapto" plain text); unused, default is empty.
    # wordforms = ../var/data/wordforms.txt

    # --- word length / partial matching ---
    # Index every word regardless of length (1 is also the default).
    min_word_len = 1

    # Index prefixes from one character up and enable star syntax,
    # which allows for partial results.
    min_prefix_len = 1
    enable_star = 1
}

# Delta geneset index, built from geneset_delta_src.
# Text-processing settings are the same as those of the `geneset` index.
index geneset_delta
{
    # --- data source and on-disk location ---
    source = geneset_delta_src
    path = /app/sphinx/geneset_delta_idx

    # --- text processing ---
    # English stemming; exact (unstemmed) word forms are indexed as well.
    morphology = stem_en
    index_exact_words = 1

    # Strip HTML markup from incoming text before indexing.
    html_strip = 1

    # Words listed in this file are not indexed.
    stopwords = /app/sphinx/stopwords.txt

    # Optional wordforms file ("mapfrom > mapto" plain text); unused, default is empty.
    # wordforms = ../var/data/wordforms.txt

    # --- word length / partial matching ---
    # Index every word regardless of length (1 is also the default).
    min_word_len = 1

    # Index prefixes from one character up and enable star syntax,
    # which allows for partial results.
    min_prefix_len = 1
    enable_star = 1
}

#############################################################################
## indexer settings
#############################################################################

indexer
{
    # RAM limit for an indexing run. Sphinx documents 2047M as the maximum
    # possible value, so the limit is set to that rather than to 2048M.
    mem_limit = 2047M
}

#############################################################################
## searchd settings
#############################################################################

searchd
{
    # --- process bookkeeping ---
    # File holding the searchd process ID (mandatory setting).
    pid_file = /app/sphinx/sphinx-searchd.pid

    # --- network ---
    # No explicit listen directive: the daemon uses its default of
    # 0.0.0.0:9312 (all interfaces, port 9312). The directive is multi-value
    # and accepts a hostname, a port, hostname:port, or a unix socket path:
    #
    # listen = 127.0.0.1
    # listen = 192.168.0.1:9312
    # listen = 9312
    # listen = /var/run/searchd.sock

    # --- logging ---
    # Daemon run log (default would be 'searchd.log').
    log = /app/sphinx/sphinx-searchd.log

    # Search query log (default is empty, i.e. queries are not logged).
    query_log = /app/sphinx/sphinx-query.log

    # --- limits ---
    # Upper bound on forked children, i.e. concurrent searches
    # (default 0 means unlimited).
    max_children = 30

    # Maximum number of matches kept in RAM per index (default 1000).
    # WARNING: there is also a per-query limit, see the SetLimits() API call.
    max_matches = 1000
    max_packet_size = 64M

    # avoid deprecation warning
    #compat_sphinxql_magics = 0
}
17 changes: 17 additions & 0 deletions sample-configs/k8s/sphinx/start_sphinx.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
#
# Entry point for the Sphinx search container.
#
# 1. Prepends a `source base` section (PostgreSQL connection settings taken
#    from the DB_HOST, DB_USERNAME, DB_PASSWORD and DB_NAME environment
#    variables) to /app/sphinx/sphinx.conf.
# 2. Builds all indexes with `indexer`.
# 3. Replaces this shell with `searchd` running in the foreground.

# Stop at the first failing command, unset variable, or failed pipeline stage,
# so a broken config or a failed index build never reaches searchd.
set -euo pipefail

SPHINX_CONF=/app/sphinx/sphinx.conf

# Fail early, with a readable message, when a connection setting is missing.
# DB_PASSWORD only has to be set; an empty password is still accepted.
: "${DB_HOST:?DB_HOST must be set}"
: "${DB_USERNAME:?DB_USERNAME must be set}"
: "${DB_PASSWORD?DB_PASSWORD must be set}"
: "${DB_NAME:?DB_NAME must be set}"

# Prepend the `base` source only once: running the script again against an
# already-patched config would otherwise define `source base` twice.
if ! grep -Eq '^[[:space:]]*source[[:space:]]+base[[:space:]]*$' "$SPHINX_CONF"; then
    {
        printf '%s\n' "source base"
        printf '%s\n' "{"
        printf '%s\n' "    type = pgsql"
        printf '%s\n' "    sql_host = $DB_HOST"
        printf '%s\n' "    sql_user = $DB_USERNAME"
        printf '%s\n' "    sql_pass = $DB_PASSWORD"
        printf '%s\n' "    sql_db = $DB_NAME"
        printf '%s\n' "}"
        cat "$SPHINX_CONF"
    } > "${SPHINX_CONF}.new"

    # Write to a temporary file and rename, so the config is never left half-written.
    mv "${SPHINX_CONF}.new" "$SPHINX_CONF"
fi

indexer --all --config "$SPHINX_CONF"

# exec hands this process over to searchd, so signals sent to the script
# (e.g. SIGTERM on container shutdown) reach the daemon directly.
exec searchd --nodetach --config "$SPHINX_CONF"
Loading

0 comments on commit cdef14c

Please sign in to comment.