00001 #include "log.h"
00002 #include "format.h"
00003 #include "linestream.h"
00004 #include "numUtil.h"
00005 #include "intervalFind.h"
00006
00007
00008
00020 typedef struct {
00021 char* chromosome;
00022 int start;
00023 int end;
00024 Array sublist;
00025 } SuperInterval;
00026
00027
00028
00029 static Array intervals = NULL;
00030 static Array superIntervals = NULL;
00031 static int superIntervalAssigned = 0;
00032
00033
00034
00040 int intervalFind_getNumberOfIntervals (void)
00041 {
00042 return arrayMax (intervals);
00043 }
00044
00045
00046
00047
00054 Array intervalFind_getAllIntervals (void)
00055 {
00056 return arrayCopy (intervals);
00057 }
00058
00059
00060
00061 static void processCommaSeparatedList (Array results, char *str)
00062 {
00063 WordIter w;
00064 char *tok;
00065
00066 w = wordIterCreate (str,",",0);
00067 while (tok = wordNext (w)) {
00068 if (tok[0] == '\0') {
00069 continue;
00070 }
00071 array (results,arrayMax (results),int) = atoi (tok);
00072 }
00073 wordIterDestroy (w);
00074 }
00075
00076
00077
00107 void intervalFind_addIntervalsToSearchSpace (char* fileName, int source)
00108 {
00109 LineStream ls;
00110 WordIter w;
00111 char *line;
00112 Array subIntervalStarts;
00113 Array subIntervalEnds;
00114 SubInterval *currSubInterval;
00115 Interval *currInterval;
00116 int i;
00117
00118 if (intervals == NULL) {
00119 intervals = arrayCreate (100000,Interval);
00120 }
00121 ls = ls_createFromFile (fileName);
00122 while (line = ls_nextLine (ls)) {
00123 if (line[0] == '\0') {
00124 continue;
00125 }
00126 w = wordIterCreate (line,"\t",0);
00127 currInterval = arrayp (intervals,arrayMax (intervals),Interval);
00128 currInterval->source = source;
00129 currInterval->name = hlr_strdup (wordNext (w));
00130 currInterval->chromosome = hlr_strdup (wordNext (w));
00131 currInterval->strand = wordNext (w)[0];
00132 currInterval->start = atoi (wordNext (w));
00133 currInterval->end = atoi (wordNext (w));
00134 currInterval->subIntervalCount = atoi (wordNext (w));
00135 subIntervalStarts = arrayCreate (currInterval->subIntervalCount,int);
00136 subIntervalEnds = arrayCreate (currInterval->subIntervalCount,int);
00137 processCommaSeparatedList (subIntervalStarts,wordNext (w));
00138 processCommaSeparatedList (subIntervalEnds,wordNext (w));
00139 if (arrayMax (subIntervalStarts) != arrayMax (subIntervalEnds)) {
00140 die ("Unequal number of subIntervalStarts and subIntervalEnds");
00141 }
00142 currInterval->subIntervals = arrayCreate (currInterval->subIntervalCount,SubInterval);
00143 for (i = 0; i < currInterval->subIntervalCount; i++) {
00144 currSubInterval = arrayp (currInterval->subIntervals,arrayMax (currInterval->subIntervals),SubInterval);
00145 currSubInterval->start = arru (subIntervalStarts,i,int);
00146 currSubInterval->end = arru (subIntervalEnds,i,int);
00147 }
00148 arrayDestroy (subIntervalStarts);
00149 arrayDestroy (subIntervalEnds);
00150 wordIterDestroy (w);
00151 }
00152 ls_destroy (ls);
00153 }
00154
00155
00156
00157 static int sortIntervalsByChromosomeAndStartAndEnd (Interval *a, Interval *b)
00158 {
00159 int diff;
00160
00161 diff = strcmp (a->chromosome,b->chromosome);
00162 if (diff != 0) {
00163 return diff;
00164 }
00165 diff = a->start - b->start;
00166 if (diff != 0) {
00167 return diff;
00168 }
00169 return b->end - a->end;
00170 }
00171
00172
00173
00174 static int sortSuperIntervalsByChromosomeAndStartAndEnd (SuperInterval *a, SuperInterval *b)
00175 {
00176 int diff;
00177
00178 diff = strcmp (a->chromosome,b->chromosome);
00179 if (diff != 0) {
00180 return diff;
00181 }
00182 diff = a->start - b->start;
00183 if (diff != 0) {
00184 return diff;
00185 }
00186 return b->end - a->end;
00187 }
00188
00189
00190
00191 static void assignSuperIntervals (void)
00192 {
00193 int i,j;
00194 Interval *currInterval,*nextInterval;
00195 SuperInterval *currSuperInterval;
00196
00197 superIntervals = arrayCreate (100000,SuperInterval);
00198 arraySort (intervals,(ARRAYORDERF)sortIntervalsByChromosomeAndStartAndEnd);
00199 i = 0;
00200 while (i < arrayMax (intervals)) {
00201 currInterval = arrp (intervals,i,Interval);
00202 currSuperInterval = arrayp (superIntervals,arrayMax (superIntervals),SuperInterval);
00203 currSuperInterval->chromosome = hlr_strdup (currInterval->chromosome);
00204 currSuperInterval->start = currInterval->start;
00205 currSuperInterval->end = currInterval->end;
00206 currSuperInterval->sublist = arrayCreate (10,Interval*);
00207 array (currSuperInterval->sublist,arrayMax (currSuperInterval->sublist),Interval*) = currInterval;
00208 j = i + 1;
00209 while (j < arrayMax (intervals)) {
00210 nextInterval = arrp (intervals,j,Interval);
00211 if (strEqual (currInterval->chromosome,nextInterval->chromosome) &&
00212 currInterval->start <= nextInterval->start &&
00213 currInterval->end >= nextInterval->end) {
00214 array (currSuperInterval->sublist,arrayMax (currSuperInterval->sublist),Interval*) = nextInterval;
00215 }
00216 else {
00217 break;
00218 }
00219 j++;
00220 }
00221 i = j;
00222 }
00223 arraySort (superIntervals,(ARRAYORDERF)sortSuperIntervalsByChromosomeAndStartAndEnd);
00224 }
00225
00226
00227
00228 static void addIntervals (Array matchingIntervals, Array sublist, int start, int end)
00229 {
00230 int i;
00231 Interval *currInterval;
00232
00233 for (i = 0; i < arrayMax (sublist); i++) {
00234 currInterval = arru (sublist,i,Interval*);
00235 if (rangeIntersection (currInterval->start,currInterval->end,start,end) >= 0) {
00236 array (matchingIntervals,arrayMax (matchingIntervals),Interval*) = currInterval;
00237 }
00238 }
00239 }
00240
00241
00242
00252 Array intervalFind_getOverlappingIntervals (char* chromosome, int start, int end)
00253 {
00254 SuperInterval testSuperInterval;
00255 SuperInterval *currSuperInterval;
00256 int i,index;
00257 static Array matchingIntervals = NULL;
00258
00259 if (superIntervalAssigned == 0) {
00260 assignSuperIntervals ();
00261 superIntervalAssigned = 1;
00262 }
00263 if (matchingIntervals == NULL) {
00264 matchingIntervals = arrayCreate (20,Interval*);
00265 }
00266 else {
00267 arrayClear (matchingIntervals);
00268 }
00269 testSuperInterval.chromosome = chromosome;
00270 testSuperInterval.start = start;
00271 testSuperInterval.end = end;
00272 arrayFind (superIntervals,&testSuperInterval,&index,(ARRAYORDERF)sortSuperIntervalsByChromosomeAndStartAndEnd);
00273
00274 i = index;
00275 while (i >= 0) {
00276 currSuperInterval = arrp (superIntervals,i,SuperInterval);
00277 if (!strEqual (currSuperInterval->chromosome,chromosome) || currSuperInterval->end < start) {
00278 break;
00279 }
00280 addIntervals (matchingIntervals,currSuperInterval->sublist,start,end);
00281 i--;
00282 }
00283 i = index + 1;
00284 while (i < arrayMax (superIntervals)) {
00285 currSuperInterval = arrp (superIntervals,i,SuperInterval);
00286 if (!strEqual (currSuperInterval->chromosome,chromosome) || currSuperInterval->start > end) {
00287 break;
00288 }
00289 addIntervals (matchingIntervals,currSuperInterval->sublist,start,end);
00290 i++;
00291 }
00292 return matchingIntervals;
00293 }
00294
00295
00296
00297