/* Copyright (c) 2006 Pierre Lindenbaum PhD Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the ``Software''), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. The name of the authors when specified in the source files shall be kept unmodified. THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 4XT.ORG BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. mysql> create function taxon_name returns string soname "taxonudf.o"; mysql> create function taxon_id returns integer soname "taxonudf.o"; mysql> create aggregate function taxon_com returns integer soname "taxonudf.o"; mysql> select taxon_childof(taxon_id("Homo"),taxon_id("Homo Sapiens")) as "is Homo child of Homo.Sapiens ?"; +---------------------------------+ | is Homo child of Homo.Sapiens ? | +---------------------------------+ | 0 | +---------------------------------+ 1 row in set (0,00 sec) mysql> select taxon_childof(taxon_id("Homo Sapiens"),taxon_id("Homo")) as "Is Homo.Sapiens descendant of Homo ?"; +--------------------------------------+ | Is Homo.Sapiens descendant of Homo ? 
| +--------------------------------------+ | 1 | +--------------------------------------+ 1 row in set (0,00 sec) mysql> create temporary table t1(cluster varchar(20),taxon int); insert into t1(cluster,taxon) values("A",251093),("A",9781), ("A",37348),("B",9605),("B",9606),("B",63221),("C",32523),("C",33154),("C",7776),("C",9443); mysql> select cluster,taxon as "ncbi-id",taxon_name(taxon) as "Name",taxon_childof(taxon,taxon_id("Primates")) as "Is_Primate" from t1; +---------+---------+-------------------------------+------------+ | cluster | ncbi-id | Name | Is_Primate | +---------+---------+-------------------------------+------------+ | A | 251093 | Elephas antiquus | 0 | | A | 9781 | Elephantidae gen. sp. | 0 | | A | 37348 | Mammuthus | 0 | | B | 9605 | Homo | 1 | | B | 9606 | Homo sapiens | 1 | | B | 63221 | Homo sapiens neanderthalensis | 1 | | C | 32523 | Tetrapoda | 0 | | C | 33154 | Fungi/Metazoa group | 0 | | C | 7776 | Gnathostomata | 0 | | C | 9443 | Primates | 1 | +---------+---------+-------------------------------+------------+ 10 rows in set (0.00 sec) mysql> select taxon,taxon_name(taxon) from t1 where taxon_childof(taxon,taxon_id("Hominidae")); +-------+-------------------------------+ | taxon | taxon_name(taxon) | +-------+-------------------------------+ | 9605 | Homo | | 9606 | Homo sapiens | | 63221 | Homo sapiens neanderthalensis | +-------+-------------------------------+ 3 rows in set (0.00 sec) mysql> select cluster,taxon_com(taxon) as ncbi_id from t1 group by cluster; +---------+---------+ | cluster | ncbi_id | +---------+---------+ | A | 9780 | | B | 9605 | | C | 33154 | +---------+---------+ 3 rows in set (0.00 sec) */ #include #include #include #include #include #include #define TAXON_NAME_SIZE 50 typedef long long taxon_type_t; typedef struct Taxonomy { taxon_type_t tax_id; taxon_type_t parent_id; char name[TAXON_NAME_SIZE]; }Taxon,*const TaxonPtr; typedef struct Lineage_t { Taxon* taxon; struct Lineage_t *next; }Lineage,*LineagePtr; 
/* Sentinel used where no taxon id is available. */
static const taxon_type_t NO_TAXON=-1;

/*
 * Built-in taxonomy table, one {tax_id, parent_id, name} entry per node.
 * Original note kept from the author:
 *   txid63221[Organism:noexp] or txid37348[Organism:exp] or txid99490[Organism:noexp]
 *
 * The entries MUST stay sorted by ascending tax_id: getTaxonById() looks
 * them up with bsearch().
 */
static const Taxon all_taxons[]={
    {1,-1,"root"},
    {2759,131567,"Eukaryota"},
    {6072,33208,"Eumetazoa"},
    {7711,33511,"Chordata"},
    {7742,89593,"Vertebrata"},
    {7776,7742,"Gnathostomata"},
    {8287,117571,"Sarcopterygii"},
    {9347,32525,"Eutheria"},
    {9443,314146,"Primates"},
    {9526,314293,"Catarrhini"},
    {9604,314295,"Hominidae"},
    {9605,207598,"Homo"},
    {9606,9605,"Homo sapiens"},
    {9779,311790,"Proboscidea"},
    {9780,9779,"Elephantidae"},
    {9781,9780,"Elephantidae gen. sp."},
    {9782,9780,"Elephas"},
    {9783,9782,"Elephas maximus"},
    {9784,9780,"Loxodonta"},
    {9785,9784,"Loxodonta africana"},
    {32523,8287,"Tetrapoda"},
    {32524,32523,"Amniota"},
    {32525,40674,"Theria"},
    {33154,2759,"Fungi/Metazoa group"},
    {33208,33154,"Metazoa"},
    {33213,6072,"Bilateria"},
    {33316,33213,"Coelomata"},
    {33511,33316,"Deuterostomia"},
    {37348,9780,"Mammuthus"},
    {37349,37348,"Mammuthus primigenius"},
    {39051,9780,"Mammut"},
    {39053,39051,"Mammut americanum"},
    {40674,32524,"Mammalia"},
    {63221,9606,"Homo sapiens neanderthalensis"},
    {89593,7711,"Craniata"},
    {99487,9783,"Elephas maximus indicus"},
    {99488,9783,"Elephas maximus maximus"},
    {99490,9784,"Loxodonta cyclotis"},
    {117570,7776,"Teleostomi"},
    {117571,117570,"Euteleostomi"},
    {131567,1,"cellular organisms"},
    {169825,9784,"Loxodonta cyclotis x africana"},
    {207598,9604,"Homo/Pan/Gorilla group"},
    {251093,9782,"Elephas antiquus"},
    {251094,251093,"Elephas antiquus falconeri"},
    {311790,9347,"Afrotheria"},
    {314146,9347,"Euarchontoglires"},
    {314293,376913,"Simiiformes"},
    {314295,9526,"Hominoidea"},
    {363578,9783,"Elephas maximus asurus"},
    {363579,9782,"Elephas cypriotes"},
    {363580,9782,"Elephas sp. NHMC 20.2.2.1"},
    {376913,9443,"Haplorrhini"}
};

/* Number of entries in all_taxons. */
#define TAXON_NODE_COUNT ((size_t)(sizeof(all_taxons)/sizeof(all_taxons[0])))

/*
 * bsearch() comparator ordering two Taxon records by tax_id.
 * The ids are compared directly instead of being subtracted: a long long
 * difference truncated to int can wrap to 0 or change sign, which would make
 * an unknown id match an existing entry.
 */
static int compareTaxonsOnId(const void *m1, const void *m2)
{
    const taxon_type_t left  = ((const Taxon *) m1)->tax_id;
    const taxon_type_t right = ((const Taxon *) m2)->tax_id;

    return (left > right) - (left < right);
}

/*
 * Finds the entry of all_taxons whose tax_id equals val.
 * Returns a pointer into the static table, or NULL when the id is unknown.
 */
static TaxonPtr getTaxonById(taxon_type_t val)
{
    Taxon key;

    /* only tax_id is read by the comparator */
    key.tax_id = val;
    return (TaxonPtr) bsearch(&key, all_taxons, TAXON_NODE_COUNT,
                              sizeof(Taxon), compareTaxonsOnId);
}

/* Releases every node of a lineage list; accepts NULL. */
static void freeLineage(LineagePtr ptr)
{
    while (ptr != NULL) {
        LineagePtr following = ptr->next;
        free(ptr);
        ptr = following;
    }
}

/*
 * Builds the list of ancestors of `taxon`, topmost ancestor first and `taxon`
 * itself last, followed by the existing list `next`.
 *
 * Returns the head of the new list (to be released with freeLineage), or NULL
 * when `taxon` is NULL or an allocation fails. On failure only the nodes
 * created by this call are released, each exactly once; `next` is left
 * untouched.
 */
static LineagePtr _getTaxonLineage(TaxonPtr taxon, LineagePtr next)
{
    const Taxon *current = taxon;
    LineagePtr head = next;

    if (taxon == NULL) {
        return NULL;
    }

    /* walk up the parent links, pushing each taxon in front of the list */
    while (current != NULL) {
        LineagePtr node = malloc(sizeof *node);
        if (node == NULL) {
            /* undo the partial work without touching the caller's list */
            while (head != next) {
                LineagePtr following = head->next;
                free(head);
                head = following;
            }
            return NULL;
        }
        node->taxon = (Taxon *) current;
        node->next = head;
        head = node;
        current = getTaxonById(current->parent_id);
    }
    return head;
}

/* Returns the lineage of `taxon` (topmost ancestor first), or NULL on error. */
static LineagePtr getTaxonLineage(TaxonPtr taxon)
{
    return _getTaxonLineage(taxon, NULL);
}

/*
 * Finds the deepest taxon shared by the lineages of `a` and `b`.
 *
 * When `distance` is not NULL it receives the number of links separating the
 * common ancestor from `a` plus the number separating it from `b`.
 * Returns NULL when an argument is NULL, when a lineage cannot be built, or
 * when the two lineages share no taxon.
 */
static TaxonPtr _getCommonAncestor(TaxonPtr a, TaxonPtr b, int *distance)
{
    Taxon *found = NULL;
    LineagePtr lineage[2] = {NULL, NULL};
    LineagePtr curr[2] = {NULL, NULL};
    LineagePtr prev[2] = {NULL, NULL};

    if (a == NULL || b == NULL) {
        return NULL;
    }

    lineage[0] = getTaxonLineage(a);
    if (lineage[0] == NULL) {
        return NULL;
    }
    lineage[1] = getTaxonLineage(b);
    if (lineage[1] == NULL) {
        freeLineage(lineage[0]);
        return NULL;
    }

    /* both lists start at the top of the tree: advance while they agree */
    curr[0] = lineage[0];
    curr[1] = lineage[1];
    while (curr[0] != NULL && curr[1] != NULL &&
           curr[0]->taxon == curr[1]->taxon) {
        found = curr[0]->taxon;
        prev[0] = curr[0];
        prev[1] = curr[1];
        curr[0] = curr[0]->next;
        curr[1] = curr[1]->next;
    }

    if (distance != NULL) {
        /* prev[] point at the common ancestor in each list (or are NULL) */
        *distance = 0;
        while (prev[0] != NULL && prev[0]->taxon != a) {
            ++(*distance);
            prev[0] = prev[0]->next;
        }
        while (prev[1] != NULL && prev[1]->taxon != b) {
            ++(*distance);
            prev[1] = prev[1]->next;
        }
    }

    freeLineage(lineage[0]);
    freeLineage(lineage[1]);
    return found;
}

/* Returns the deepest common ancestor of `a` and `b`, or NULL on failure. */
static TaxonPtr getCommonAncestor(TaxonPtr a, TaxonPtr b)
{
    return _getCommonAncestor(a, b, (int *) NULL);
}

/*
 * Returns the number of links between `a` and `b` through their common
 * ancestor, or -1 when no common ancestor can be determined.
 */
static int getTaxonDistance(TaxonPtr a, TaxonPtr b)
{
    int i = 0;

    if (_getCommonAncestor(a, b, &i) == NULL) {
        return -1;
    }
    return i;
}

/** taxon_name */
my_bool taxon_name_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
void taxon_name_deinit(UDF_INIT *initid);
char *taxon_name(UDF_INIT *initid, UDF_ARGS *args, char *result,
                 unsigned long *length, char *is_null, char *error);

/** taxon_childof */
my_bool taxon_childof_init(UDF_INIT *initid, UDF_ARGS *args, char *message);
void taxon_childof_deinit(UDF_INIT *initid);
long long taxon_childof(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error);

/** taxon_name */

/*
 * Checks that taxon_name() receives exactly one integer argument and
 * allocates the TAXON_NAME_SIZE-byte buffer in which the result is returned.
 * Returns 0 on success, 1 (with `message` filled in) on error.
 */
my_bool taxon_name_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
{
    if (!(args->arg_count == 1 && args->arg_type[0] == INT_RESULT)) {
        strncpy(message, "Bad parameter expected an integer", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    initid->maybe_null = 1;
    initid->ptr = (char *) malloc(sizeof(char) * (TAXON_NAME_SIZE));
    if (initid->ptr == NULL) {
        strncpy(message, "Out Of Memory", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    return 0;
}

/* Releases the buffer allocated by taxon_name_init(). */
void taxon_name_deinit(UDF_INIT *initid)
{
    free(initid->ptr);
}

/*
 * Returns the name of the taxon whose id is the first argument.
 * The name is copied into the buffer held by initid->ptr and its length is
 * stored in *length. A NULL argument or an unknown id yields SQL NULL.
 */
char *taxon_name(UDF_INIT *initid, UDF_ARGS *args, char *result,
                 unsigned long *length, char *is_null, char *error)
{
    Taxon *taxon = NULL;
    taxon_type_t int_val = NO_TAXON;

    if (args->args[0] == NULL) {
        *is_null = 1;
        return NULL;
    }
    int_val = *((long long *) args->args[0]);
    *error = 0;

    taxon = getTaxonById(int_val);
    if (taxon == NULL) {
        *is_null = 1;
        return NULL;
    }

    *length = strlen(taxon->name);
    memcpy(initid->ptr, taxon->name, TAXON_NAME_SIZE);
    initid->ptr[*length] = '\0';
    return initid->ptr;
}

/** taxon_childof */

/*
 * Checks that taxon_childof() receives exactly two integer arguments.
 * Returns 0 on success, 1 (with `message` filled in) on error.
 */
my_bool taxon_childof_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
{
    if (!(args->arg_count == 2 &&
          args->arg_type[0] == INT_RESULT &&
          args->arg_type[1] == INT_RESULT)) {
        strncpy(message, "Bad parameter expected 2 integer", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    initid->maybe_null = 1;
    return 0;
}

void taxon_childof_deinit(UDF_INIT *initid)
{
    /* nothing to release */
}

/*
 * Returns 1 when the taxon given as first argument is the taxon given as
 * second argument or one of its descendants, 0 otherwise.
 * A NULL argument or an unknown first id yields SQL NULL.
 */
long long taxon_childof(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error)
{
    Taxon *child = NULL;
    long long child_val = NO_TAXON;
    long long parent_val = NO_TAXON;

    if (args->args[0] == NULL || args->args[1] == NULL) {
        *is_null = 1;
        return ((long long) -1);
    }
    child_val = *((long long *) args->args[0]);
    parent_val = *((long long *) args->args[1]);
    *error = 0;

    child = getTaxonById(child_val);
    if (child == NULL) {
        *is_null = 1;
        return ((long long) -1);
    }

    /* climb from the child towards the top until the wanted id is met */
    while (child != NULL) {
        if (parent_val == child->tax_id) {
            return ((long long) 1);
        }
        child = getTaxonById(child->parent_id);
    }
    return ((long long) 0);
}

/** taxon_id */

/*
 * Checks that taxon_id() receives exactly one string argument.
 * Returns 0 on success, 1 (with `message` filled in) on error.
 */
my_bool taxon_id_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
{
    if (!(args->arg_count == 1 && args->arg_type[0] == STRING_RESULT)) {
        strncpy(message, "Bad parameter expected 1 string", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    initid->max_length = TAXON_NAME_SIZE;
    initid->maybe_null = 1;
    return 0;
}

void taxon_id_deinit(UDF_INIT *initid)
{
    /* nothing to release */
}

/*
 * Returns the id of the taxon whose name equals the string argument,
 * ignoring case. A NULL argument, an argument of TAXON_NAME_SIZE bytes or
 * more, or an unknown name yields SQL NULL.
 */
long long taxon_id(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error)
{
    const char *wanted = args->args[0];
    unsigned long wanted_len = 0;
    size_t i = 0;

    if (wanted == NULL || args->lengths[0] >= TAXON_NAME_SIZE) {
        *is_null = 1;
        return NO_TAXON;
    }
    wanted_len = args->lengths[0];

    /*
     * Loop over the taxons to find the user's string. String arguments of a
     * UDF are not guaranteed to be NUL-terminated, so the comparison is
     * bounded by the length MySQL reports and requires the names to have the
     * same length.
     */
    for (i = 0; i < TAXON_NODE_COUNT; ++i) {
        const char *name = all_taxons[i].name;

        if (strlen(name) == wanted_len &&
            strncasecmp(wanted, name, wanted_len) == 0) {
            *is_null = 0;
            return all_taxons[i].tax_id;
        }
    }
    *is_null = 1;
    return NO_TAXON;
}

/** taxon_com */

/* Per-group state of the taxon_com() aggregate. */
typedef struct common_t
{
    Taxon *taxon;     /* common ancestor of the rows seen so far, or NULL */
    my_bool is_error; /* set to 1 once a row could not be processed */
} *CommonPtr;

/*
 * Checks that taxon_com() receives exactly one integer argument and
 * allocates the aggregate state stored in initid->ptr.
 * Returns 0 on success, 1 (with `message` filled in) on error.
 */
my_bool taxon_com_init(UDF_INIT *initid, UDF_ARGS *args, char *message)
{
    CommonPtr data;

    if (!(args->arg_count == 1 && args->arg_type[0] == INT_RESULT)) {
        strncpy(message, "Bad parameter expected one integer", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    initid->maybe_null = 1;

    data = (CommonPtr) malloc(sizeof(struct common_t));
    if (data == NULL) {
        initid->ptr = NULL;
        strncpy(message, "Out of memory.", MYSQL_ERRMSG_SIZE);
        return 1;
    }
    data->taxon = NULL;
    data->is_error = 0;
    initid->ptr = (char *) data;
    return 0;
}

/*
 * Folds one row into the aggregate: the stored taxon becomes the common
 * ancestor of itself and the row's taxon. A NULL value, an unknown id or a
 * failed ancestor lookup puts the group in the error state, after which
 * further rows are ignored.
 */
void taxon_com_add(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *message)
{
    CommonPtr data = (CommonPtr) initid->ptr;

    if (data->is_error == 1) {
        return;
    }

    if (*is_null == 0 && args->args[0] != NULL) {
        taxon_type_t id = *((long long *) args->args[0]);
        TaxonPtr taxon = getTaxonById(id);

        if (taxon == NULL) {
            data->is_error = 1;
            data->taxon = NULL;
        } else if (data->taxon == NULL) {
            /* first row of the group */
            data->taxon = taxon;
        } else {
            TaxonPtr com = getCommonAncestor(taxon, data->taxon);

            if (com == NULL) {
                data->is_error = 1;
                data->taxon = NULL;
            } else {
                data->taxon = com;
            }
        }
    } else {
        data->is_error = 1;
        data->taxon = NULL;
    }
}

/*
 * Resets the aggregate state at the start of a group.
 * The declared return type is kept as in the original code; the function now
 * returns 0 explicitly instead of falling off the end of a non-void function.
 * NOTE(review): the MySQL UDF interface documents xxx_clear() as returning
 * void -- consider changing the return type if nothing depends on it.
 */
char taxon_com_clear(UDF_INIT *initid, char *is_null, char *error)
{
    CommonPtr data = (CommonPtr) initid->ptr;

    data->taxon = NULL;
    data->is_error = 0;
    *is_null = 0;
    return 0;
}

/* Starts a new group: clears the state, then adds the group's first row. */
void taxon_com_reset(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *message)
{
    taxon_com_clear(initid, is_null, message);
    taxon_com_add(initid, args, is_null, message);
}

/* Releases the aggregate state allocated by taxon_com_init(). */
void taxon_com_deinit(UDF_INIT *initid)
{
    free(initid->ptr);
}

/*
 * Returns the id of the common ancestor of all rows of the group.
 * Yields SQL NULL for an empty group, and SQL NULL with *error set when the
 * group is in the error state.
 */
long long taxon_com(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error)
{
    CommonPtr data = (CommonPtr) initid->ptr;

    if (data->is_error == 1) {
        *error = 1;
        *is_null = 1;
        return NO_TAXON;
    } else if (data->taxon == NULL) {
        *error = 0;
        *is_null = 1;
        return NO_TAXON;
    } else {
        *is_null = 0;
        return data->taxon->tax_id;
    }
}