suffix array's search methods (govnokod)

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
Unistring ConcatenatedUnistringContainer::prepareQueryPattern(std::string &pattern) {
auto pattern_modified = Unistring::decode(Utf8String((char *)pattern.c_str()));
for (uint32_t i = 0, len = pattern_modified.size(); i < len; i++) {
if (unichar::is_star_original(pattern_modified[i]))
pattern_modified[i] = unichar::pattern_star_code();
}
return pattern_modified;
}
std::vector<Unistring> ConcatenatedUnistringContainer::splitQueryPattern(Unistring &pattern) {
auto simple_patterns = std::vector<Unistring>();
auto cutted_pattern = pattern.slice();
while (cutted_pattern.size() > 0) {
int left_pattern_position = 0;
int right_pattern_position = cutted_pattern.size();
for (int i = 0, len = cutted_pattern.size(); i < len; i++) {
if (cutted_pattern[i] != unichar::pattern_star_code()) {
left_pattern_position = i;
break;
}
if (i == cutted_pattern.size() - 1)
return simple_patterns;
}
for (int i = cutted_pattern.size() - 1, len = left_pattern_position; i > len; i++) {
if (cutted_pattern[i] != unichar::pattern_star_code()) {
right_pattern_position = i;
break;
}
}
cutted_pattern = cutted_pattern.slice(left_pattern_position, right_pattern_position);
int star_idx = cutted_pattern.find(unichar::pattern_star_unichar());
if (star_idx == -1) {
simple_patterns.push_back(cutted_pattern.substr(0, cutted_pattern.size()));
return simple_patterns;
} else {
simple_patterns.push_back(cutted_pattern.substr(0, star_idx));
cutted_pattern.slice(star_idx + 1, cutted_pattern.size());
}
}
return simple_patterns;
}
std::pair<long, long> ConcatenatedUnistringContainer::factorBounds(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
long left_bound = -1;
long right_bound = -1;
for (uint32_t i = l, len = r; i <= len; i++) {
//std::cout << "looking for the left bound" << std::endl;
//std::cout << "ch: " << ch << ", cmp_ch: " << getUnichar(suffix_permutation[i] + used_prefix) << std::endl;
if (getUnichar(suffix_permutation[i] + used_prefix) == ch) {
left_bound = i;
//std::cout << "Then left bound is " << left_bound << std::endl;
break;
}
}
if (left_bound == -1) return std::pair<long, long>(-1, -1);
for (uint32_t i = left_bound, len = r; i <= len; i++) {
//std::cout << "looking for the right bound" << std::endl;
//std::cout << "ch: " << ch << ", cmp_ch: " << getUnichar(suffix_permutation[i] + used_prefix) << std::endl;
if (i == len || getUnichar(suffix_permutation[i + 1] + used_prefix) != ch) {
right_bound = i;
//std::cout << "Then right bound is " << right_bound << std::endl;
return std::pair<long, long>(left_bound, right_bound);
}
}
return std::pair<long, long>(-1, -1);
}
/*std::pair<long, long> ConcatenatedUnistringContainer::factorBoundsBin(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
uint32_t left_bound = l, high_bound = r;
//fix it with linear search on small size (i.e. < 20-50)
// get the start index of target number
long start_idx = -1;
long end_idx = -1;
if (r - l < 20) {
for (uint32_t i = l; i <= r; i++) {
unichar comparer = getUnichar(suffix_permutation[i] + used_prefix);
unichar discarder = (i > l) ? getUnichar(suffix_permutation[i - 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
start_idx = i;
break;
}
}
for (uint32_t i = start_idx; i <= r; i++) {
unichar comparer = getUnichar(suffix_permutation[i] + used_prefix);
unichar discarder = (i < r) ? getUnichar(suffix_permutation[i + 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
end_idx = i;
break;
}
}
} else {
while (left_bound <= high_bound) {
int mid = (high_bound - left_bound) / 2 + left_bound;
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid > l) ? getUnichar(suffix_permutation[mid-1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
start_idx = mid;
break;
} else if (comparer < ch) {
left_bound = mid + 1;
} else {
high_bound = mid - 1;
}
}
if (start_idx == r) return std::pair<long, long>(start_idx, start_idx);
// get the end index of target number
if (r - start_idx < 20) {
for (uint32_t i = start_idx; i <= r; i++) {
unichar comparer = getUnichar(suffix_permutation[i] + used_prefix);
unichar discarder = (i < r) ? getUnichar(suffix_permutation[i + 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
end_idx = i;
break;
}
}
} else {
left_bound = start_idx;
high_bound = r;
while (left_bound <= high_bound) {
int mid = (high_bound - left_bound) / 2 + left_bound;
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid < r) ? getUnichar(suffix_permutation[mid + 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
end_idx = mid;
break;
} else if (comparer > ch) {
high_bound = mid - 1;
} else {
left_bound = mid + 1;
}
}
}
}
if (start_idx != -1 && end_idx != -1){
return std::pair<long, long>(start_idx, end_idx);
}
return std::pair<long, long>(-1, -1);
}*/
long ConcatenatedUnistringContainer::searchLeft(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
if (l > r) return -1;
uint32_t mid = (r == l) ? r : (r - l)/2 + l;
if (used_prefix == suffix(suffix_permutation[mid]).size()) {
return searchLeft(ch, used_prefix, mid + 1, r);
}
if (used_prefix == suffix(suffix_permutation[mid - 1]).size()) {
return searchLeft(ch, used_prefix, mid, r);
}
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid > l) ? getUnichar(suffix_permutation[mid - 1] + used_prefix) : 0;
if (ch == comparer && ch != discarder) {
return mid;
} else if (comparer >= ch) {
return searchLeft(ch, used_prefix, l, mid - 1);
} else {
return searchLeft(ch, used_prefix, mid + 1, r);
}
}
long ConcatenatedUnistringContainer::searchRight(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
if (l > r) return -1;
uint32_t mid = (r == l) ? r : (r - l)/2 + l;
if (used_prefix == suffix(suffix_permutation[mid]).size()) {
return searchRight(ch, used_prefix, mid + 1, r);
}
if (used_prefix == suffix(suffix_permutation[mid + 1]).size()) {
return searchRight(ch, used_prefix, mid + 2, r);
}
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid < r) ? getUnichar(suffix_permutation[mid + 1] + used_prefix) : 0;
if (ch == comparer && ch != discarder) {
return mid;
} else if (comparer != ch) {
return searchLeft(ch, used_prefix, l, mid - 1);
} else {
return searchLeft(ch, used_prefix, mid + 1, r);
}
}
std::pair<long, long> ConcatenatedUnistringContainer::factorBoundsBin(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
long start_idx = searchLeft(ch, used_prefix, l, r);
if (start_idx == -1) return std::pair<long, long>(-1, -1);
if (start_idx == r) return std::pair<long, long>(start_idx, start_idx);
return std::pair<long, long>(start_idx, searchRight(ch, used_prefix, start_idx, r));
}
std::map<int, int> ConcatenatedUnistringContainer::simpleQueryMatching(
Unistring &simple_pattern,
std::map<int, int> prev_match,
bool left_inexact_match,
bool right_inexact_match) {
auto selection = std::map<int, int>();
QueryTask task = QueryTask(0, suffix_permutation.size() - 1, 0);
while (task.left_bound <= task.right_bound) {
auto bounds = factorBoundsBin(
simple_pattern[task.used_prefix],
task.used_prefix,
task.left_bound,
task.right_bound);
std::cout << "u_p: " << task.used_prefix << "l: " << task.left_bound << "r: " << task.right_bound << std::endl;
if (bounds.first == -1 || bounds.second == -1) {
std::cout << "Sorry, pattern was not found. 1" << std::endl;
return selection;
}
//auto first_matched_suffix = suffix(suffix_permutation[bounds.first]);
auto first_matched_suffix = suffix(suffix_permutation[bounds.second]);
if (first_matched_suffix.size() >= simple_pattern.size()) {
for (uint32_t i = bounds.second - 1; i >= bounds.first; i--) {
auto next_suffix = suffix(suffix_permutation[i]);
if (next_suffix.size() < simple_pattern.size()) {
bounds.first = i + 1;
//lcp = LCP(bounds.first, bounds.second);
//lcp = first_matched_suffix.size();
break;
}
first_matched_suffix = next_suffix;
}
}
auto lcp = LCP(bounds.first, bounds.second);
bool right_match_condition = (right_inexact_match)
? (lcp >= simple_pattern.size())
: (lcp == simple_pattern.size());
bool right_compare_condition = (right_match_condition)
? (simple_pattern.slice(task.used_prefix, 0) == first_matched_suffix.slice(task.used_prefix, simple_pattern.size()))
: (simple_pattern.slice(task.used_prefix, 0) == first_matched_suffix.slice(task.used_prefix));
if (right_match_condition && right_compare_condition) {
for (uint32_t i = bounds.first; i <= bounds.second; i++) {
auto current_suffix = suffix(suffix_permutation[i]);
bool grab_this_suffix = (right_inexact_match)
? true
: current_suffix.size() == simple_pattern.size();
if (!grab_this_suffix) break;
auto current_string_number = find_string_index(current_suffix.start_idx);
bool is_not_overlap = (prev_match.count(current_string_number)
&& prev_match[current_string_number] >= current_suffix.start_idx + simple_pattern.size());
std::cout << "p_m[" << current_string_number << "] == " << prev_match[current_string_number] << std::endl;
bool left_match_condition = (left_inexact_match)
? current_suffix.start_idx >= strings[current_string_number].start_idx
: current_suffix.start_idx == strings[current_string_number].start_idx;
if (is_not_overlap && left_match_condition) {
uint32_t max_start_pos = (selection.count(current_string_number))
? std::min(static_cast<int>(selection[current_string_number]), static_cast<int>(current_suffix.start_idx))
: current_suffix.start_idx;
selection.emplace(current_string_number, max_start_pos);
}
}
if (selection.empty())
std::cout << "Sorry, pattern was not found. Selection empty? - " << selection.empty() << std::endl;
return selection;
}
task.left_bound = bounds.first;
task.right_bound = bounds.second;
task.used_prefix = lcp;
}
std::cout << "Sorry, pattern was not found in while cycle." << std::endl;
return selection;
}
std::set<uint32_t> ConcatenatedUnistringContainer::simpleInnerQuery(Unistring &simple_pattern) {
//std::cout << "Entrance to simple inner query" << std::endl;
auto selection = std::set<uint32_t>();
QueryTask task = QueryTask(0, suffix_permutation.size() - 1, 0);
//std::cout << task << std::endl;
while(task.left_bound <= task.right_bound) {
//std::cout << task << std::endl;
//auto bounds = factorBounds(simple_pattern[task.used_prefix], task.used_prefix, task.left_bound, task.right_bound);
auto bounds = factorBoundsBin(simple_pattern[task.used_prefix], task.used_prefix, task.left_bound, task.right_bound);
//assert(bounds.first == clone_bounds.first);
//assert(bounds.second == clone_bounds.second);
//std::cout << "Current bounds: [" << bounds.first << ", " << bounds.second << "]" << std::endl;
if (bounds.first == -1 || bounds.second == -1) {
std::cout << "Sorry, pattern was not found." << std::endl;
return std::set<uint32_t>();
}
auto lcp = LCP(bounds.first, bounds.second);
if (bounds.first == bounds.second && lcp == simple_pattern.size() + 1) {
auto lone_matched_suffix = suffix(suffix_permutation[bounds.first]);
if (simple_pattern.slice(task.used_prefix, -1) ==
lone_matched_suffix.slice(task.used_prefix, lone_matched_suffix.size() - 1)) {
selection.insert(find_string_index(suffix_permutation[bounds.first]));
return selection;
}
}
if (simple_pattern.size() <= lcp) {
auto first_mathing_suffix = suffix(suffix_permutation[bounds.first]);
if (simple_pattern.slice(task.used_prefix, -1) ==
first_mathing_suffix.slice(task.used_prefix, simple_pattern.size())) {
for (uint32_t i = bounds.first; i <= bounds.second; i++) {
selection.insert(find_string_index(suffix_permutation[i]));
}
return selection;
}
}
task.left_bound = bounds.first;
task.right_bound = bounds.second;
task.used_prefix = lcp;
}
std::cout << "Sorry, I'm looking into all of my words. There haven't any results for you :(" << std::endl;
return selection;
}
std::map<int, int> ConcatenatedUnistringContainer::query(std::string &original_pattern) {
//std::cout << "Entrance to the query" << std::endl;
Unistring pattern = prepareQueryPattern(original_pattern);
//auto splitted = splitQueryPattern(pattern);
//for (int i = 0, len = splitted.size(); i < len; i++) {
// std::cout << splitted[i] << std::endl;
//}
//std::cout << "Making following pattern: \"" << pattern << "\"" << std::endl;
if (!pattern.contains_unichar(unichar::pattern_star_code()))
return simpleQueryMatching(pattern, first_result, false, false);
else {
//bool left_star_existence = (pattern[0] == unichar::pattern_star_code());
//bool right_star_existence = (pattern[pattern.size() - 1] == unichar::pattern_star_code());
}
return std::map<int, int>();
//return complexInnerQuery(pattern);
}
//---------------------------------------------------------------------------------------------------//
Unistring prepareQueryPattern(std::string &original_pattern);
std::vector<Unistring> splitQueryPattern(Unistring &pattern);
std::pair<long, long> factorBounds(unichar ch, int used_prefix, uint32_t l, uint32_t r);
long searchLeft(unichar ch, int used_prefix, uint32_t l, uint32_t r);
long searchRight(unichar ch, int used_prefix, uint32_t l, uint32_t r);
std::pair<long, long> factorBoundsBin(unichar ch, int used_prefix, uint32_t l, uint32_t r);
std::set<uint32_t> simpleInnerQuery(Unistring &pattern);
std::map<int, int> simpleQueryMatching(
Unistring &simple_pattern,
std::map<int, int> prev_match,
bool left_inexact_match,
bool right_inexact_match);
std::map<int, int> query(std::string &pattern);