// code_or_text.cc // (C) 2008 Philip Endecott // See http://chezphil.org/restless/ // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #include "code_or_text.hh" #include #include #include #include #include #include using namespace std; // Character frequency analysis // ---------------------------- typedef tr1::array char_freq_t; // Typical character frequency table for code and text come from these files: char_freq_t code_freq = {{ #include "codefreqs.h" }}; char_freq_t text_freq = {{ #include "textfreqs.h" }}; // Determine the correlation between two character frequency distributions: // This algorithm comes from http://en.wikipedia.org/wiki/Correlation (bottom of page); // its behaviour is non-obvious in that it's measuring the deviation from a "wrong" // mean as it goes along; however I've compared it with an "obvious" 2-pass implementation // and it does seem to get the same answer. static float correlation(const char_freq_t& x, const char_freq_t& y) { float sum_sq_x = 0; float sum_sq_y = 0; float sum_coproduct = 0; float mean_x = x[0]; float mean_y = y[0]; for (int i=1; i<256; ++i) { float sweep = static_cast(i)/(i+1); float delta_x = x[i] - mean_x; float delta_y = y[i] - mean_y; sum_sq_x += delta_x * delta_x * sweep; sum_sq_y += delta_y * delta_y * sweep; sum_coproduct += delta_x * delta_y * sweep; mean_x += delta_x / (i+1); mean_y += delta_y / (i+1); } float pop_sd_x = sqrt(sum_sq_x/256); float pop_sd_y = sqrt(sum_sq_y/256); float cov_x_y = sum_coproduct / 256; return cov_x_y / (pop_sd_x * pop_sd_y); } static bool has_codelike_charfreq(const para_t& para) { // Measure the character frequency distribution for this paragraph: char_freq_t freq = {{0}}; for (para_t::const_iterator i = para.begin(); i != para.end(); ++i) { const string& l = *i; for (string::const_iterator j = l.begin(); j != l.end(); ++j) { freq[*j] += 1; } } // Measure the correlation with the two reference distributions: float code_cor = correlation(freq,code_freq); float text_cor = correlation(freq,text_freq); // Which is it more like? return code_cor > text_cor; } // If any line is indented, it's code. (Doesn't cope with tabs.) static bool has_codelike_indentation(const para_t& para) { for (para_t::const_iterator i = para.begin(); i != para.end(); ++i) { const string& l = *i; if (!l.empty() && l[0]==' ') { return true; } } return false; } // How jagged is the right margin? static bool has_codelike_rightmargin(const para_t& para) { // We ignore the last line as it's typically shorter even for text. // If there is only one line, we can't say anything. if (para.size()==1) { return false; } int sum_length = 0; for (size_t n=0; n(sum_length)/(para.size()-1); // A very short mean means its code if (mean_length<35) { return true; } float sum_squares = 0; for (size_t n=0; n 0.1; } bool is_code(const para_t& para) { return has_codelike_indentation(para) || has_codelike_rightmargin(para) || has_codelike_charfreq(para); }