Unistring ConcatenatedUnistringContainer::prepareQueryPattern(std::string &pattern) {
auto pattern_modified = Unistring::decode(Utf8String((char *)pattern.c_str()));
for (uint32_t i = 0, len = pattern_modified.size(); i < len; i++) {
if (unichar::is_star_original(pattern_modified[i]))
pattern_modified[i] = unichar::pattern_star_code();
}
return pattern_modified;
}
std::vector<Unistring> ConcatenatedUnistringContainer::splitQueryPattern(Unistring &pattern) {
auto simple_patterns = std::vector<Unistring>();
auto cutted_pattern = pattern.slice();
while (cutted_pattern.size() > 0) {
int left_pattern_position = 0;
int right_pattern_position = cutted_pattern.size();
for (int i = 0, len = cutted_pattern.size(); i < len; i++) {
if (cutted_pattern[i] != unichar::pattern_star_code()) {
left_pattern_position = i;
break;
}
if (i == cutted_pattern.size() - 1)
return simple_patterns;
}
for (int i = cutted_pattern.size() - 1, len = left_pattern_position; i > len; i++) {
if (cutted_pattern[i] != unichar::pattern_star_code()) {
right_pattern_position = i;
break;
}
}
cutted_pattern = cutted_pattern.slice(left_pattern_position, right_pattern_position);
int star_idx = cutted_pattern.find(unichar::pattern_star_unichar());
if (star_idx == -1) {
simple_patterns.push_back(cutted_pattern.substr(0, cutted_pattern.size()));
return simple_patterns;
} else {
simple_patterns.push_back(cutted_pattern.substr(0, star_idx));
cutted_pattern.slice(star_idx + 1, cutted_pattern.size());
}
}
return simple_patterns;
}
std::pair<long, long> ConcatenatedUnistringContainer::factorBounds(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
long left_bound = -1;
long right_bound = -1;
for (uint32_t i = l, len = r; i <= len; i++) {
//std::cout << "looking for the left bound" << std::endl;
//std::cout << "ch: " << ch << ", cmp_ch: " << getUnichar(suffix_permutation[i] + used_prefix) << std::endl;
if (getUnichar(suffix_permutation[i] + used_prefix) == ch) {
left_bound = i;
//std::cout << "Then left bound is " << left_bound << std::endl;
break;
}
}
if (left_bound == -1) return std::pair<long, long>(-1, -1);
for (uint32_t i = left_bound, len = r; i <= len; i++) {
//std::cout << "looking for the right bound" << std::endl;
//std::cout << "ch: " << ch << ", cmp_ch: " << getUnichar(suffix_permutation[i] + used_prefix) << std::endl;
if (i == len || getUnichar(suffix_permutation[i + 1] + used_prefix) != ch) {
right_bound = i;
//std::cout << "Then right bound is " << right_bound << std::endl;
return std::pair<long, long>(left_bound, right_bound);
}
}
return std::pair<long, long>(-1, -1);
}
/*std::pair<long, long> ConcatenatedUnistringContainer::factorBoundsBin(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
uint32_t left_bound = l, high_bound = r;
//fix it with linear search on small size (i.e. < 20-50)
// get the start index of target number
long start_idx = -1;
long end_idx = -1;
if (r - l < 20) {
for (uint32_t i = l; i <= r; i++) {
unichar comparer = getUnichar(suffix_permutation[i] + used_prefix);
unichar discarder = (i > l) ? getUnichar(suffix_permutation[i - 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
start_idx = i;
break;
}
}
for (uint32_t i = start_idx; i <= r; i++) {
unichar comparer = getUnichar(suffix_permutation[i] + used_prefix);
unichar discarder = (i < r) ? getUnichar(suffix_permutation[i + 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
end_idx = i;
break;
}
}
} else {
while (left_bound <= high_bound) {
int mid = (high_bound - left_bound) / 2 + left_bound;
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid > l) ? getUnichar(suffix_permutation[mid-1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
start_idx = mid;
break;
} else if (comparer < ch) {
left_bound = mid + 1;
} else {
high_bound = mid - 1;
}
}
if (start_idx == r) return std::pair<long, long>(start_idx, start_idx);
// get the end index of target number
if (r - start_idx < 20) {
for (uint32_t i = start_idx; i <= r; i++) {
unichar comparer = getUnichar(suffix_permutation[i] + used_prefix);
unichar discarder = (i < r) ? getUnichar(suffix_permutation[i + 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
end_idx = i;
break;
}
}
} else {
left_bound = start_idx;
high_bound = r;
while (left_bound <= high_bound) {
int mid = (high_bound - left_bound) / 2 + left_bound;
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid < r) ? getUnichar(suffix_permutation[mid + 1] + used_prefix) : 0;
if (comparer == ch && ch != discarder) {
end_idx = mid;
break;
} else if (comparer > ch) {
high_bound = mid - 1;
} else {
left_bound = mid + 1;
}
}
}
}
if (start_idx != -1 && end_idx != -1){
return std::pair<long, long>(start_idx, end_idx);
}
return std::pair<long, long>(-1, -1);
}*/
long ConcatenatedUnistringContainer::searchLeft(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
if (l > r) return -1;
uint32_t mid = (r == l) ? r : (r - l)/2 + l;
if (used_prefix == suffix(suffix_permutation[mid]).size()) {
return searchLeft(ch, used_prefix, mid + 1, r);
}
if (used_prefix == suffix(suffix_permutation[mid - 1]).size()) {
return searchLeft(ch, used_prefix, mid, r);
}
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid > l) ? getUnichar(suffix_permutation[mid - 1] + used_prefix) : 0;
if (ch == comparer && ch != discarder) {
return mid;
} else if (comparer >= ch) {
return searchLeft(ch, used_prefix, l, mid - 1);
} else {
return searchLeft(ch, used_prefix, mid + 1, r);
}
}
long ConcatenatedUnistringContainer::searchRight(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
if (l > r) return -1;
uint32_t mid = (r == l) ? r : (r - l)/2 + l;
if (used_prefix == suffix(suffix_permutation[mid]).size()) {
return searchRight(ch, used_prefix, mid + 1, r);
}
if (used_prefix == suffix(suffix_permutation[mid + 1]).size()) {
return searchRight(ch, used_prefix, mid + 2, r);
}
unichar comparer = getUnichar(suffix_permutation[mid] + used_prefix);
unichar discarder = (mid < r) ? getUnichar(suffix_permutation[mid + 1] + used_prefix) : 0;
if (ch == comparer && ch != discarder) {
return mid;
} else if (comparer != ch) {
return searchLeft(ch, used_prefix, l, mid - 1);
} else {
return searchLeft(ch, used_prefix, mid + 1, r);
}
}
std::pair<long, long> ConcatenatedUnistringContainer::factorBoundsBin(unichar ch, int used_prefix, uint32_t l, uint32_t r) {
long start_idx = searchLeft(ch, used_prefix, l, r);
if (start_idx == -1) return std::pair<long, long>(-1, -1);
if (start_idx == r) return std::pair<long, long>(start_idx, start_idx);
return std::pair<long, long>(start_idx, searchRight(ch, used_prefix, start_idx, r));
}
std::map<int, int> ConcatenatedUnistringContainer::simpleQueryMatching(
Unistring &simple_pattern,
std::map<int, int> prev_match,
bool left_inexact_match,
bool right_inexact_match) {
auto selection = std::map<int, int>();
QueryTask task = QueryTask(0, suffix_permutation.size() - 1, 0);
while (task.left_bound <= task.right_bound) {
auto bounds = factorBoundsBin(
simple_pattern[task.used_prefix],
task.used_prefix,
task.left_bound,
task.right_bound);
std::cout << "u_p: " << task.used_prefix << "l: " << task.left_bound << "r: " << task.right_bound << std::endl;
if (bounds.first == -1 || bounds.second == -1) {
std::cout << "Sorry, pattern was not found. 1" << std::endl;
return selection;
}
//auto first_matched_suffix = suffix(suffix_permutation[bounds.first]);
auto first_matched_suffix = suffix(suffix_permutation[bounds.second]);
if (first_matched_suffix.size() >= simple_pattern.size()) {
for (uint32_t i = bounds.second - 1; i >= bounds.first; i--) {
auto next_suffix = suffix(suffix_permutation[i]);
if (next_suffix.size() < simple_pattern.size()) {
bounds.first = i + 1;
//lcp = LCP(bounds.first, bounds.second);
//lcp = first_matched_suffix.size();
break;
}
first_matched_suffix = next_suffix;
}
}
auto lcp = LCP(bounds.first, bounds.second);
bool right_match_condition = (right_inexact_match)
? (lcp >= simple_pattern.size())
: (lcp == simple_pattern.size());
bool right_compare_condition = (right_match_condition)
? (simple_pattern.slice(task.used_prefix, 0) == first_matched_suffix.slice(task.used_prefix, simple_pattern.size()))
: (simple_pattern.slice(task.used_prefix, 0) == first_matched_suffix.slice(task.used_prefix));
if (right_match_condition && right_compare_condition) {
for (uint32_t i = bounds.first; i <= bounds.second; i++) {
auto current_suffix = suffix(suffix_permutation[i]);
bool grab_this_suffix = (right_inexact_match)
? true
: current_suffix.size() == simple_pattern.size();
if (!grab_this_suffix) break;
auto current_string_number = find_string_index(current_suffix.start_idx);
bool is_not_overlap = (prev_match.count(current_string_number)
&& prev_match[current_string_number] >= current_suffix.start_idx + simple_pattern.size());
std::cout << "p_m[" << current_string_number << "] == " << prev_match[current_string_number] << std::endl;
bool left_match_condition = (left_inexact_match)
? current_suffix.start_idx >= strings[current_string_number].start_idx
: current_suffix.start_idx == strings[current_string_number].start_idx;
if (is_not_overlap && left_match_condition) {
uint32_t max_start_pos = (selection.count(current_string_number))
? std::min(static_cast<int>(selection[current_string_number]), static_cast<int>(current_suffix.start_idx))
: current_suffix.start_idx;
selection.emplace(current_string_number, max_start_pos);
}
}
if (selection.empty())
std::cout << "Sorry, pattern was not found. Selection empty? - " << selection.empty() << std::endl;
return selection;
}
task.left_bound = bounds.first;
task.right_bound = bounds.second;
task.used_prefix = lcp;
}
std::cout << "Sorry, pattern was not found in while cycle." << std::endl;
return selection;
}
std::set<uint32_t> ConcatenatedUnistringContainer::simpleInnerQuery(Unistring &simple_pattern) {
//std::cout << "Entrance to simple inner query" << std::endl;
auto selection = std::set<uint32_t>();
QueryTask task = QueryTask(0, suffix_permutation.size() - 1, 0);
//std::cout << task << std::endl;
while(task.left_bound <= task.right_bound) {
//std::cout << task << std::endl;
//auto bounds = factorBounds(simple_pattern[task.used_prefix], task.used_prefix, task.left_bound, task.right_bound);
auto bounds = factorBoundsBin(simple_pattern[task.used_prefix], task.used_prefix, task.left_bound, task.right_bound);
//assert(bounds.first == clone_bounds.first);
//assert(bounds.second == clone_bounds.second);
//std::cout << "Current bounds: [" << bounds.first << ", " << bounds.second << "]" << std::endl;
if (bounds.first == -1 || bounds.second == -1) {
std::cout << "Sorry, pattern was not found." << std::endl;
return std::set<uint32_t>();
}
auto lcp = LCP(bounds.first, bounds.second);
if (bounds.first == bounds.second && lcp == simple_pattern.size() + 1) {
auto lone_matched_suffix = suffix(suffix_permutation[bounds.first]);
if (simple_pattern.slice(task.used_prefix, -1) ==
lone_matched_suffix.slice(task.used_prefix, lone_matched_suffix.size() - 1)) {
selection.insert(find_string_index(suffix_permutation[bounds.first]));
return selection;
}
}
if (simple_pattern.size() <= lcp) {
auto first_mathing_suffix = suffix(suffix_permutation[bounds.first]);
if (simple_pattern.slice(task.used_prefix, -1) ==
first_mathing_suffix.slice(task.used_prefix, simple_pattern.size())) {
for (uint32_t i = bounds.first; i <= bounds.second; i++) {
selection.insert(find_string_index(suffix_permutation[i]));
}
return selection;
}
}
task.left_bound = bounds.first;
task.right_bound = bounds.second;
task.used_prefix = lcp;
}
std::cout << "Sorry, I'm looking into all of my words. There haven't any results for you :(" << std::endl;
return selection;
}
std::map<int, int> ConcatenatedUnistringContainer::query(std::string &original_pattern) {
//std::cout << "Entrance to the query" << std::endl;
Unistring pattern = prepareQueryPattern(original_pattern);
//auto splitted = splitQueryPattern(pattern);
//for (int i = 0, len = splitted.size(); i < len; i++) {
// std::cout << splitted[i] << std::endl;
//}
//std::cout << "Making following pattern: \"" << pattern << "\"" << std::endl;
if (!pattern.contains_unichar(unichar::pattern_star_code()))
return simpleQueryMatching(pattern, first_result, false, false);
else {
//bool left_star_existence = (pattern[0] == unichar::pattern_star_code());
//bool right_star_existence = (pattern[pattern.size() - 1] == unichar::pattern_star_code());
}
return std::map<int, int>();
//return complexInnerQuery(pattern);
}
//---------------------------------------------------------------------------------------------------//
Unistring prepareQueryPattern(std::string &original_pattern);
std::vector<Unistring> splitQueryPattern(Unistring &pattern);
std::pair<long, long> factorBounds(unichar ch, int used_prefix, uint32_t l, uint32_t r);
long searchLeft(unichar ch, int used_prefix, uint32_t l, uint32_t r);
long searchRight(unichar ch, int used_prefix, uint32_t l, uint32_t r);
std::pair<long, long> factorBoundsBin(unichar ch, int used_prefix, uint32_t l, uint32_t r);
std::set<uint32_t> simpleInnerQuery(Unistring &pattern);
std::map<int, int> simpleQueryMatching(
Unistring &simple_pattern,
std::map<int, int> prev_match,
bool left_inexact_match,
bool right_inexact_match);
std::map<int, int> query(std::string &pattern);