// Bigram-based spell checker (original title: «Проверка орфографии на основе биграмм»).

#include <iostream>
#include <vector>
#include <iterator>
#include <string>
#include <fstream>
using namespace std;
/**
 * Converts a word into its list of overlapping two-character tokens.
 *
 * Every pair of adjacent positions (k, k + 1) of the input yields one token.
 * Only characters in the range 'a'..'z' are copied into a token, so a window
 * that contains other characters produces a token shorter than two characters
 * (possibly empty).
 *
 * @param in word to split
 * @return one token per adjacent pair; an empty list for an empty input;
 *         for a one-character input, that input itself, unfiltered
 */
std::vector<std::string> w2t(const std::string &in)
{
    std::vector<std::string> tokens;
    const std::size_t length = in.length();

    // A single character has no pair to form; it is passed through verbatim.
    if (length == 1) {
        tokens.push_back(in);
        return tokens;
    }

    for (std::size_t start = 0; start + 1 < length; ++start) {
        std::string window;
        for (std::size_t pos = start; pos <= start + 1; ++pos) {
            const char c = in[pos];
            if (c >= 'a' && c <= 'z')
                window += c;
        }
        tokens.push_back(window);
    }
    return tokens;
}
/**
 * Computes the multiset Jaccard similarity of two token lists.
 *
 * Each token of x is paired with at most one equal, not yet paired token of y.
 * The result is matches / (|x| + |y| - matches).
 *
 * @param x tokens of the first word
 * @param y tokens of the second word
 * @return similarity in [0, 1]; 0 when both lists are empty
 */
double mcompare(const std::vector<std::string> &x, const std::vector<std::string> &y)
{
    // Paired entries of y are tracked in a separate flag array instead of being
    // overwritten with "": an empty string is a legal token value and would
    // otherwise match an already consumed slot and be counted twice.
    std::vector<char> used(y.size(), 0);
    std::size_t count = 0;

    for (const std::string &token : x) {
        for (std::size_t j = 0; j < y.size(); ++j) {
            if (!used[j] && token == y[j]) {
                used[j] = 1;
                ++count;
                break;
            }
        }
    }

    const std::size_t unionSize = x.size() + y.size() - count;
    if (unionSize == 0)
        return 0.0;  // both lists empty: avoid 0/0 (NaN)
    return static_cast<double>(count) / static_cast<double>(unionSize);
}
/**
 * Reads words from standard input, one per line, and prints for each of them
 * the best matching word from the dictionary file "count_big.txt".
 *
 * The dictionary is read as whitespace-separated "word frequency" pairs.
 * A dictionary word replaces the current candidate when its bigram similarity
 * (mcompare over w2t tokens) to the input word is strictly higher, or when it
 * is equal, non-zero, and the dictionary word has a higher frequency. An input
 * word with no similar dictionary entry is printed unchanged.
 *
 * @return 0 on success, 1 if the dictionary file cannot be opened
 */
int main()
{
    // Everything tracked for one input line.
    struct Entry {
        std::string best;                // current answer; starts as the input word itself
        std::vector<std::string> grams;  // bigrams of the original input word
        double score;                    // highest similarity seen so far
        int freq;                        // dictionary frequency of the current answer
    };

    // Open the dictionary first so a missing file is reported before any input is read.
    std::ifstream dict("count_big.txt");
    if (!dict) {
        std::cerr << "cannot open count_big.txt\n";
        return 1;
    }

    std::vector<Entry> entries;
    std::string word;
    while (std::getline(std::cin, word)) {
        // Input with CRLF line endings leaves a '\r' that is not part of the word.
        if (!word.empty() && word.back() == '\r')
            word.pop_back();
        entries.push_back(Entry{word, w2t(word), 0.0, 0});
    }

    int freq = 0;
    while (dict >> word >> freq) {
        const std::vector<std::string> grams = w2t(word);
        for (Entry &entry : entries) {
            const double sim = mcompare(entry.grams, grams);
            if (sim > entry.score) {
                entry.score = sim;
                entry.best = word;
                entry.freq = freq;
            } else if (sim == entry.score && sim != 0 && freq > entry.freq) {
                // Equal similarity: prefer the more frequent dictionary word.
                entry.best = word;
                entry.freq = freq;
            }
        }
    }

    for (const Entry &entry : entries)
        std::cout << entry.best << '\n';
    return 0;
}