// src/markup.cc // This file is part of libpbe; see http://decimail.org // (C) 2004 Philip Endecott // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #include "markup.hh" #include "StringTransformer.hh" #include #include class AtDotToWords: public StringTransformer { public: AtDotToWords() { add_cs_rule('@'," AT "); add_cs_rule('.'," DOT "); } }; static AtDotToWords at_dot_to_words; static string render_email(string e) { string h = ""+at_dot_to_words(e)+""; return h; } static string render_uri(string u) { string uri; if (u.substr(0,4)!="http") { uri = "http://" + u; } else { uri=u; } string h = ""; h += u; h += ""; return h; } // See RFC2396 for URI syntax. // We make the protocol prefix optional, so www.foo.com works. // We require that the last component of the domain name is purely // alphabetic and has at least two characters. This is true in // practice (.com, .uk) and helps to avoid false positives ("e.g."). // We require that the last character of a path segment is not a // common punctuation character: .!):;, This is because // these rarely fall at the end of a true URI but are often placed // after a URI when it is written in text. // We apply this after escaping & to &. This is only an issue in // the path where ; normally has a special meaning which we have to // ignore. // We reuse the URI domain rules for emails. // For email local parts we use the list of characters allowed by // RFC2822, but allow adjacent .s. // We disallow @ in URIs so that we can use it to distinguish emails // from URIs. They should be allowed in paths, queries and // fragment-ids. const char* uri_regexp = "(https?://)?" // optional protocol "[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*\\.[a-zA-Z][a-zA-Z]+" // hostname "(:[0-9]+)?" // optional port number "(/[-a-zA-Z0-9_.!~*'()%:&;=+$,]*[-a-zA-Z0-9_~*'(%&=+$])*" // path, may be empty "/?" // path to directory may be terminated with a / "(\\?[-;/?:&=+$,a-zA-Z0-9_.!~*'()]*)?" // optional queery "(#[-;/?:&=+$,a-zA-Z0-9_.!~*'()]*)?" // optional fragment-id ; const char* email_regexp = "[a-zA-Z0-9!#$%&'*+-/=?^_`{|}~.]+" // local-part "@" "[-a-zA-Z0-9]+(\\.[-a-zA-Z0-9]+)*\\.[a-zA-Z][a-zA-Z]+" // hostname ; void markup_uris_emails ( string& text ) { static bool re_compiled = false; static regex_t re; if (!re_compiled) { string regexp = string("(") + uri_regexp + ")|(" + email_regexp +")"; int rc = regcomp(&re, regexp.c_str(), REG_EXTENDED); assert(rc==0); re_compiled=true; } string r; regmatch_t match; int pos=0; while (1) { int rc = regexec(&re, text.c_str()+pos, 1, &match, 0); assert((rc==0) || (rc==REG_NOMATCH)); if (rc==REG_NOMATCH) { r.append(text.substr(pos)); break; } r += text.substr(pos,match.rm_so); string p = text.substr(pos+match.rm_so,match.rm_eo-match.rm_so); if (p.find('@')!=p.npos) { r += render_email(p); } else { r += render_uri(p); } pos += match.rm_eo; } text = r; }