Lolly 1.4.27
Loading...
Searching...
No Matches
fast_search.cpp
Go to the documentation of this file.
1
2/******************************************************************************
3 * MODULE : fast_search.cpp
4 * DESCRIPTION: Fast multiple searches in same string
5 * COPYRIGHT : (C) 2014 Joris van der Hoeven
6 *******************************************************************************
7 * This software falls under the GNU general public license version 3 or later.
8 * It comes WITHOUT ANY WARRANTY WHATSOEVER. For details, see the file LICENSE
9 * in the root directory or <http://www.gnu.org/licenses/gpl-3.0.html>.
10 ******************************************************************************/
11
12#include "fast_search.hpp"
13#include "analyze.hpp"
14#include "iterator.hpp"
15
16/******************************************************************************
17 * Subroutines
18 ******************************************************************************/
19
20static int
21hash_combine (int c1, int c2, int l) {
22 // FIXME: we should rather compute modulo 2^32 - 1 instead of 2^32,
23 // or modulo some prime number < 2^32, and avoid the cyclicity of period 32
24 int rot= (9 * l) & 31;
25 if (rot == 0) return c1 ^ c2;
26 unsigned int u1= c1;
27 unsigned int u2= c2;
28 unsigned int r2=
29 ((u2 << rot) & 0xffffffff) ^ ((u2 >> (32 - rot)) & 0xffffffff);
30 return (int) (u1 ^ r2);
32
33static int
34fast_hash (string s) {
35 unsigned int h= 0;
36 for (int i= N (s) - 1; i >= 0; i--) {
37 h= ((h << 9) & 0xffffffff) ^ ((h >> 23) & 0xffffffff);
38 h= h ^ ((unsigned int) (unsigned char) s[i]);
39 }
40 return (int) h;
41}
42
43/******************************************************************************
44 * Construct search engine
45 ******************************************************************************/
46
48 // NOTE: a[i] contains a hashmap with all associations c :-> j,
49 // where c is the hash code of the substring s (j, j + 2^i).
50 int i, n= N (s);
51 array<int> codes (n);
52 for (i= 0; i < n; i++)
53 codes[i]= (int) (unsigned int) (unsigned char) s[i];
54 int l= 1;
55 while (l <= n) {
57 for (i= 0; i + l <= n; i++) {
58 int c= codes[i];
59 if (!h->contains (c)) h (c)= array<int> ();
60 h (c) << i;
61 }
62 a << h;
63 int d= 2 * l;
64 for (i= 0; i + d <= n; i++)
65 codes[i]= hash_combine (codes[i], codes[i + l], l);
66 l= d;
67 }
68}
69
70string
74
75/******************************************************************************
76 * Search
77 ******************************************************************************/
78
81 if (N (what) == 0) {
83 for (int i= 0; i <= N (s); i++)
84 r << i;
85 return r;
86 }
87 int k= 1, l= 0;
88 while ((k << 1) <= N (what)) {
89 k<<= 1;
90 l++;
91 }
92 int code= fast_hash (what (0, k));
93 if (!a[l]->contains (code)) return array<int> ();
94 else return a[l][code];
95}
96
97int
100 for (int i= 0; i < N (ps); i++) {
101 int next= ps[i];
102 if (next >= pos && test (s, next, what)) return next;
103 }
104 return -1;
105}
106
109 // NOTE: There are better algorithms based on generalized suffix trees.
110 // However, in practice, our approach based on hashtables might be faster
113 for (int i= 0; i < N (ps); i++) {
114 int pos= ps[i];
115 if (test (s, pos, what)) r << pos;
116 }
117 return r;
118}
119
120/******************************************************************************
121 * Get (leftmost) longest common substring
122 ******************************************************************************/
123
124void
125get_longest_common (string s1, string s2, int& b1, int& e1, int& b2, int& e2) {
126 // NOTE: There are better algorithms based on generalized suffix trees.
127 b1= e1= b2= e2 = 0;
128 int bestl= 0;
131 for (int i= min (N (ss1->a), N (ss2->a)) - 1; i >= 0; i--) {
135 while (it->busy ()) {
136 int h = it->next ();
137 array<int> ps1= a1[h];
138 array<int> ps2= a2[h];
139 if (N (ps1) == 0 || N (ps2) == 0) continue;
140 for (int j1= 0; j1 < N (ps1); j1++) {
141 int pos1= ps1[j1];
142 if (b1 <= pos1 && pos1 < e1) continue;
143 for (int j2= 0; j2 < N (ps2); j2++) {
144 int pos2= ps2[j2];
145 int bb1= pos1, ee1= pos1, bb2= pos2, ee2= pos2;
146 while (bb1 > 0 && bb2 > 0 && s1[bb1 - 1] == s2[bb2 - 1]) {
147 bb1--;
148 bb2--;
149 }
150 while (ee1 < N (s1) && ee2 < N (s2) && s1[ee1] == s2[ee2]) {
151 ee1++;
152 ee2++;
153 }
154 if (ee1 - bb1 > bestl ||
155 (ee1 - bb1 == bestl && (bb1 < b1 || (bb1 == b1 && bb2 < b2)))) {
156 b1 = bb1;
157 e1 = ee1;
158 b2 = bb2;
159 e2 = ee2;
160 bestl= e1 - b1;
161 }
162 }
163 }
164 }
165 }
166}
bool test(string s, int i, const char *test)
Definition analyze.cpp:542
int N(array< T > a)
Get the length of the array.
Definition array.hpp:170
bool contains(T a, array< T > b)
Check if an array contains a specified element.
blackbox b1
blackbox b2
The list class represents a linked list.
Definition list.hpp:48
int search_next(string what, int pos)
array< int > search_all(string what)
array< int > search_sub(string what)
array< hashmap< int, array< int > > > a
string_searcher_rep(string s)
void get_longest_common(string s1, string s2, int &b1, int &e1, int &b2, int &e2)
static int hash_combine(int c1, int c2, int l)
static int fast_hash(string s)
iterator< T > iterate(hashmap< T, U > h)
Generates an iterator for a container of type hashmap<T, U>.
Definition iterator.ipp:166
SI min(SI i, SI j)
Returns the minimum of two signed integers.
Definition minmax.hpp:27