ProcImap.Utils.Processing

1 ############################################################################ 2 # Copyright (C) 2008 by Michael Goerz # 3 # http://www.physik.fu-berlin.de/~goerz # 4 # # 5 # This program is free software; you can redistribute it and#or modify # 6 # it under the terms of the GNU General Public License as published by # 7 # the Free Software Foundation; either version 3 of the License, or # 8 # (at your option) any later version. # 9 # # 10 # This program is distributed in the hope that it will be useful, # 11 # but WITHOUT ANY WARRANTY; without even the implied warranty of # 12 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # 13 # GNU General Public License for more details. # 14 # # 15 # You should have received a copy of the GNU General Public License # 16 # along with this program; if not, write to the # 17 # Free Software Foundation, Inc., # 18 # 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # 19 ############################################################################ 20 21 """ This module contains general functions for processing mail locally 22 e.g. for filtering and classification. 23 """ 24 25 import re 26 import subprocess 27 import tempfile 28 import os 29 from email.generator import Generator 30 from cStringIO import StringIO 31 32 from ProcImap.ImapMessage import ImapMessage 33

class AddressListFile:
    """ This class wraps around a file containing email addresses (or
        address fragments / regexes), one entry per line.
        It is intended to help with Whitelisting, Blacklisting, etc.

        Blank lines in the file are ignored, and surrounding whitespace
        is stripped from every entry.
    """

    def __init__(self, filename, inmemory=False, regexes=False):
        """ Initialize AddressListFile:
            If inmemory is True, the file is loaded into memory once;
            otherwise it is re-read on every uncached lookup.
            If regexes is True, the lines in the file are compiled
            as regexes.
        """
        self._cache = {}    # lookupstring -> bool, results of contains()
        self._data = []     # parsed entries; only filled if inmemory
        self.filename = filename
        self._inmemory = inmemory
        self._use_regexes = regexes
        if self._inmemory:
            with open(filename) as infile:
                self._data = list(self._parse(infile))

    def _parse(self, lines):
        """ Yield one entry per non-blank line: a compiled regex if
            regexes are used, the stripped line otherwise.
        """
        for line in lines:
            line = line.strip()
            if not line:
                # An empty entry would match every address
                # ('' in s is always True, and so is re.match('', s)).
                continue
            if self._use_regexes:
                yield re.compile(line)
            else:
                yield line

    def _any_match(self, entries, lookupstring):
        """ Return True if one of the entries matches lookupstring. """
        for entry in entries:
            if self._use_regexes:
                if entry.match(lookupstring):
                    return True
            elif entry in lookupstring:
                return True
        return False

    def contains(self, lookupstring):
        """ Return True if there is a line in the represented file that is
            contained in lookupstring. E.g., if you search for
            'someone@gmail.com' and the file contains a line '@gmail.com',
            True is returned.
            If regexes are used, return True if there is a regex in the
            file that matches at the beginning of lookupstring (re.match
            semantics; end the regex with '$' to require a complete match).
            Return False if lookupstring is None.
            Results are cached per lookupstring.
        """
        if lookupstring is None:
            return False
        if lookupstring in self._cache:
            return self._cache[lookupstring]
        if self._inmemory:
            found = self._any_match(self._data, lookupstring)
        else:
            # The with-block closes the file even when a match ends the
            # scan early.
            with open(self.filename) as infile:
                found = self._any_match(self._parse(infile), lookupstring)
        self._cache[lookupstring] = found
        return found

    def add(self, line):
        """ Add line to self.filename.
            A terminating newline is appended if it is missing. Empty or
            whitespace-only lines are ignored. Cached lookup results are
            discarded, and the in-memory data (if any) is extended, so
            that the new entry takes effect immediately.
        """
        if not line.strip():
            return
        with open(self.filename, "a") as outfile:
            outfile.write(line)
            if not line.endswith("\n"):
                outfile.write("\n")
        # A cached False may have become True through the new entry.
        self._cache.clear()
        if self._inmemory:
            self._data.extend(self._parse(line.splitlines()))

109 110 111

112 -class ReplacementListFile:

113 """ This class wraps around a file containing email address 114 replacements. 115 The text file contains lines such as 116 noreply@couchsurfing.com :: Couchsurfing <noreply@couchsurfing.com> 117 The intention is to to replace 'noreply@couchsurfing.com' 118 with 'Couchsurfing <noreply@couchsurfing.com>'. 119 You can use this to make the from-line look nice in your email 120 reader, if people send you crippled from-lines. 121 """

122 - def __init__(self, filename, inmemory=False, regexes=False, partial=False):

123 self._cache = {} 124 self._data = None 125 if regexes: 126 self._data = [] 127 else: 128 self._data = {} # dicts will only work for non-regexes 129 self.filename = filename 130 self._inmemory = inmemory 131 self._use_regexes = regexes 132 self._partial = partial 133 if self._inmemory: 134 if self._use_regexes: 135 infile = open(filename) 136 for line in infile: 137 (original, replacement) = line.split("::", 1) 138 original = re.compile(original.strip()) 139 replacement = replacement.strip() 140 self._data.append((original, replacement)) 141 infile.close() 142 else: 143 infile = open(filename) 144 for line in infile: 145 (original, replacement) = line.split("::", 1) 146 original = original.strip() 147 replacement = replacement.strip() 148 self._data[original] = replacement 149 infile.close()

150 - def lookup(self, searchstring):

151 """ Return a replacement. If no replacement is found, return 152 the searchstring. 153 """ 154 if searchstring is None: 155 return None 156 if self._cache.has_key(searchstring): 157 return self._cache[searchstring] 158 if self._use_regexes: 159 if self._inmemory: 160 for (regex, replacement) in self._data: 161 if regex.match(searchstring): 162 self._cache[searchstring] = replacement 163 return replacement 164 else: 165 infile = open(self.filename) 166 for line in infile: 167 (original, replacement) = line.split("::", 1) 168 original = re.compile(original.strip()) 169 replacement = replacement.strip() 170 regex = re.compile(line[:-1]) 171 if regex.match(searchstring): 172 self._cache[searchstring] = replacement 173 return replacement 174 infile.close() 175 else: 176 if self._inmemory: 177 if self._data.has_key(searchstring): 178 replacement = self._data[searchstring] 179 self._cache[searchstring] = replacement 180 return replacement 181 else: 182 if self._partial: 183 for (original, replacement) in self._data.items(): 184 if original in searchstring: 185 self._cache[searchstring] = replacement 186 return replacement 187 else: 188 return searchstring 189 else: 190 infile = open(self.filename) 191 for line in infile: 192 (original, replacement) = line.split("::", 1) 193 original = original.strip() 194 replacement = replacement.strip() 195 if original in searchstring: 196 self._cache[searchstring] = replacement 197 return replacement 198 infile.close() 199 self._cache[searchstring] = searchstring 200 return searchstring

201 - def add(self, line):

202 """ Add line to self.filename """ 203 outfile = open(self.filename, "a") 204 outfile.write(line) 205 if not line[-1] == "\n": 206 outfile.write("\n") 207 outfile.close()

208 209 210

def pipe_message(message, command):
    """ Pipe the message through a shell command:
            cat message | command > message
        message is assumed to be an instance of ImapMessage.
        command is a string that is executed by the shell.
        Returns the modified message as a new instance of ImapMessage,
        carrying over the imap flags and internaldate of the original
        message, as well as its 'myflags' and 'mailbox' attributes if
        they are present.

        NOTE: the complete message is written to the command before any
        of its output is read. A command that produces a lot of output
        before it has consumed all of its input can fill the pipe and
        block.
    """
    # Serialize first, so that no process is started if this fails.
    memoryfile = StringIO()
    generator = Generator(memoryfile, mangle_from_=False, maxheaderlen=60)
    generator.flatten(message)

    p = subprocess.Popen([command], shell=True,
                         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
                         close_fds=True)
    (child_stdout, child_stdin) = (p.stdout, p.stdin)
    try:
        try:
            child_stdin.write(memoryfile.getvalue())
        finally:
            # Closing stdin signals end-of-input to the command.
            child_stdin.close()
        modified_message = ImapMessage(child_stdout)
    finally:
        child_stdout.close()
        # Reap the child, so that it does not linger as a zombie process.
        p.wait()

    modified_message.set_imapflags(message.get_imapflags())
    modified_message.internaldate = message.internaldate
    if hasattr(message, 'myflags'):
        modified_message.myflags = message.myflags
    if hasattr(message, 'mailbox'):
        modified_message.mailbox = message.mailbox
    return modified_message

235

try:
    _TEXT_TYPE = unicode
except NameError:
    # Python 3: every str already is a unicode string
    _TEXT_TYPE = str

# Translation table for the Latin-1 range, built once at import time.
_XLATE = {
    # unicode string : (replacement, weight)
    u'\N{ACUTE ACCENT}'                               : ( "", 0),
    u'\N{BROKEN BAR}'                                 : ( '|', 0),
    u'\N{CEDILLA}'                                    : ( '', 0),
    u'\N{CENT SIGN}'                                  : ( ' cent', 0),
    u'\N{COPYRIGHT SIGN}'                             : ( '(c)', 1),
    u'\N{CURRENCY SIGN}'                              : ( '', 0),
    u'\N{DEGREE SIGN}'                                : ( '', 1),
    u'\N{DIAERESIS}'                                  : ( '', 0),
    u'\N{DIVISION SIGN}'                              : ( '/', 1),
    u'\N{FEMININE ORDINAL INDICATOR}'                 : ( '', 0),
    u'\N{INVERTED EXCLAMATION MARK}'                  : ( '!', 1),
    u'\N{INVERTED QUESTION MARK}'                     : ( '?', 1),
    u'\N{LATIN CAPITAL LETTER A WITH ACUTE}'          : ( 'A', 1),
    u'\N{LATIN CAPITAL LETTER A WITH CIRCUMFLEX}'     : ( 'A', 1),
    u'\N{LATIN CAPITAL LETTER A WITH DIAERESIS}'      : ( 'Ae', 1),
    u'\N{LATIN CAPITAL LETTER A WITH GRAVE}'          : ( 'A', 1),
    u'\N{LATIN CAPITAL LETTER A WITH RING ABOVE}'     : ( 'A', 1),
    u'\N{LATIN CAPITAL LETTER A WITH TILDE}'          : ( 'A', 1),
    u'\N{LATIN CAPITAL LETTER AE}'                    : ( 'Ae', 2),
    u'\N{LATIN CAPITAL LETTER C WITH CEDILLA}'        : ( 'C', 1),
    u'\N{LATIN CAPITAL LETTER E WITH ACUTE}'          : ( 'E', 1),
    u'\N{LATIN CAPITAL LETTER E WITH CIRCUMFLEX}'     : ( 'E', 1),
    u'\N{LATIN CAPITAL LETTER E WITH DIAERESIS}'      : ( 'E', 1),
    u'\N{LATIN CAPITAL LETTER E WITH GRAVE}'          : ( 'E', 1),
    u'\N{LATIN CAPITAL LETTER ETH}'                   : ( 'Th', 1),
    u'\N{LATIN CAPITAL LETTER I WITH ACUTE}'          : ( 'I', 1),
    u'\N{LATIN CAPITAL LETTER I WITH CIRCUMFLEX}'     : ( 'I', 1),
    u'\N{LATIN CAPITAL LETTER I WITH DIAERESIS}'      : ( 'I', 1),
    u'\N{LATIN CAPITAL LETTER I WITH GRAVE}'          : ( 'I', 1),
    u'\N{LATIN CAPITAL LETTER N WITH TILDE}'          : ( 'N', 1),
    u'\N{LATIN CAPITAL LETTER O WITH ACUTE}'          : ( 'O', 1),
    u'\N{LATIN CAPITAL LETTER O WITH CIRCUMFLEX}'     : ( 'O', 1),
    u'\N{LATIN CAPITAL LETTER O WITH DIAERESIS}'      : ( 'Oe', 2),
    u'\N{LATIN CAPITAL LETTER O WITH GRAVE}'          : ( 'O', 1),
    u'\N{LATIN CAPITAL LETTER O WITH STROKE}'         : ( 'O', 1),
    u'\N{LATIN CAPITAL LETTER O WITH TILDE}'          : ( 'O', 1),
    u'\N{LATIN CAPITAL LETTER THORN}'                 : ( 'th', 1),
    u'\N{LATIN CAPITAL LETTER U WITH ACUTE}'          : ( 'U', 1),
    u'\N{LATIN CAPITAL LETTER U WITH CIRCUMFLEX}'     : ( 'U', 1),
    u'\N{LATIN CAPITAL LETTER U WITH DIAERESIS}'      : ( 'Ue', 2),
    u'\N{LATIN CAPITAL LETTER U WITH GRAVE}'          : ( 'U', 1),
    u'\N{LATIN CAPITAL LETTER Y WITH ACUTE}'          : ( 'Y', 1),
    u'\N{LATIN SMALL LETTER A WITH ACUTE}'            : ( 'a', 1),
    u'\N{LATIN SMALL LETTER A WITH CIRCUMFLEX}'       : ( 'a', 1),
    u'\N{LATIN SMALL LETTER A WITH DIAERESIS}'        : ( 'ae', 2),
    u'\N{LATIN SMALL LETTER A WITH GRAVE}'            : ( 'a', 1),
    u'\N{LATIN SMALL LETTER A WITH RING ABOVE}'       : ( 'a', 1),
    u'\N{LATIN SMALL LETTER A WITH TILDE}'            : ( 'a', 1),
    u'\N{LATIN SMALL LETTER AE}'                      : ( 'ae', 3),
    u'\N{LATIN SMALL LETTER C WITH CEDILLA}'          : ( 'c', 1),
    u'\N{LATIN SMALL LETTER E WITH ACUTE}'            : ( 'e', 1),
    u'\N{LATIN SMALL LETTER E WITH CIRCUMFLEX}'       : ( 'e', 1),
    u'\N{LATIN SMALL LETTER E WITH DIAERESIS}'        : ( 'e', 1),
    u'\N{LATIN SMALL LETTER E WITH GRAVE}'            : ( 'e', 1),
    u'\N{LATIN SMALL LETTER ETH}'                     : ( 'th', 1),
    u'\N{LATIN SMALL LETTER I WITH ACUTE}'            : ( 'i', 1),
    u'\N{LATIN SMALL LETTER I WITH CIRCUMFLEX}'       : ( 'i', 1),
    u'\N{LATIN SMALL LETTER I WITH DIAERESIS}'        : ( 'i', 1),
    u'\N{LATIN SMALL LETTER I WITH GRAVE}'            : ( 'i', 1),
    u'\N{LATIN SMALL LETTER N WITH TILDE}'            : ( 'n', 1),
    u'\N{LATIN SMALL LETTER O WITH ACUTE}'            : ( 'o', 1),
    u'\N{LATIN SMALL LETTER O WITH CIRCUMFLEX}'       : ( 'o', 1),
    u'\N{LATIN SMALL LETTER O WITH DIAERESIS}'        : ( 'oe', 2),
    u'\N{LATIN SMALL LETTER O WITH GRAVE}'            : ( 'o', 1),
    u'\N{LATIN SMALL LETTER O WITH STROKE}'           : ( 'o', 1),
    u'\N{LATIN SMALL LETTER O WITH TILDE}'            : ( 'o', 1),
    u'\N{LATIN SMALL LETTER SHARP S}'                 : ( 'ss', 2),
    u'\N{LATIN SMALL LETTER THORN}'                   : ( 'th', 0),
    u'\N{LATIN SMALL LETTER U WITH ACUTE}'            : ( 'u', 1),
    u'\N{LATIN SMALL LETTER U WITH CIRCUMFLEX}'       : ( 'u', 1),
    u'\N{LATIN SMALL LETTER U WITH DIAERESIS}'        : ( 'ue', 2),
    u'\N{LATIN SMALL LETTER U WITH GRAVE}'            : ( 'u', 1),
    u'\N{LATIN SMALL LETTER Y WITH ACUTE}'            : ( 'y', 1),
    u'\N{LATIN SMALL LETTER Y WITH DIAERESIS}'        : ( 'y', 1),
    u'\N{LEFT-POINTING DOUBLE ANGLE QUOTATION MARK}'  : ( '"', 0),
    u'\N{MACRON}'                                     : ( '', 0),
    u'\N{MASCULINE ORDINAL INDICATOR}'                : ( '', 0),
    u'\N{MICRO SIGN}'                                 : ( 'micro', 0),
    u'\N{MIDDLE DOT}'                                 : ( '*', 0),
    u'\N{MULTIPLICATION SIGN}'                        : ( '*', 0),
    u'\N{NOT SIGN}'                                   : ( 'not', 0),
    u'\N{PILCROW SIGN}'                               : ( '', 0),
    u'\N{PLUS-MINUS SIGN}'                            : ( '+/-', 0),
    u'\N{POUND SIGN}'                                 : ( ' pound', 0),
    u'\N{REGISTERED SIGN}'                            : ( '(R)', 0),
    u'\N{RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK}' : ( '"', 0),
    u'\N{SECTION SIGN}'                               : ( '', 0),
    u'\N{SOFT HYPHEN}'                                : ( '-', 0),
    u'\N{SUPERSCRIPT ONE}'                            : ( '1', 0),
    u'\N{SUPERSCRIPT THREE}'                          : ( '3', 0),
    u'\N{SUPERSCRIPT TWO}'                            : ( '2', 0),
    u'\N{VULGAR FRACTION ONE HALF}'                   : ( '{1/2}', 0),
    u'\N{VULGAR FRACTION ONE QUARTER}'                : ( '{1/4}', 0),
    u'\N{VULGAR FRACTION THREE QUARTERS}'             : ( '{3/4}', 0),
    u'\N{YEN SIGN}'                                   : ( 'yen', 0),
}


def _guess_encoding(inputstring):
    """ Guess the encoding of the byte string inputstring.
        The encoding that reaches the highest score in the translation of
        characters is assumed correct. The score is a weighted count of
        successful translations, divided by the number of total
        characters. Returns 'ascii' if no encoding reaches a positive
        score.
    """
    encodings = ['utf8', 'latin_1', 'cp037', 'cp437' , 'cp850', 'cp852',
                 'cp863', 'cp865', 'cp1140', 'cp1250', 'cp1252',
                 'iso8859_15', 'mac_latin2', 'utf_16']
    alphabet = u"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ :!,"
    found_encoding = 'ascii'
    max_score = 0.0
    for encoding in encodings:
        try:
            unistring = _TEXT_TYPE(inputstring, encoding)
        except UnicodeDecodeError:
            # this encoding doesn't work. Try the next one.
            continue
        if not unistring:
            # Decoded to nothing (e.g. a lone UTF-16 byte order mark):
            # there is nothing to score, and the division below would
            # fail.
            continue
        successcount = 0
        for character in unistring:
            if character in _XLATE:
                # translated characters contribute with their defined
                # weight
                successcount += _XLATE[character][1]
            if character in alphabet:
                # characters that are in the standard alphabet are good
                # They contribute with a weight of 2
                successcount += 2
        score = float(successcount) / float(len(unistring))
        if score > max_score:
            # always take the encoding with the highest score; on a tie
            # the encoding that was tried first is kept
            found_encoding = encoding
            max_score = score
    return found_encoding


def unknown_to_ascii(inputstring):
    """ This takes a string or unicode string in unknown encoding, tries to
        guess the encoding and to replace Latin-1 characters with something
        equivalent in 7-bit ASCII. Decoding an unknown string is based on
        heuristics. This function may return complete garbage.
        The function returns a plain ASCII string, making a best effort to
        convert Latin-1 characters into ASCII equivalents. It does not just
        strip out the Latin-1 characters. All characters in the standard 7-bit
        ASCII range are preserved. In the 8th bit range all the Latin-1
        accented letters are converted to unaccented equivalents. Most symbol
        characters are converted to something meaningful. Anything not
        converted is deleted.

        Unicode strings are translated directly, without any guessing.

        Adapted from
        http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/251871
    """
    if isinstance(inputstring, _TEXT_TYPE):
        # Must be checked first: decoding a unicode string raises a
        # TypeError.
        unistring = inputstring
    else:
        try:
            # inputstring is ascii, nothing to do
            return str(_TEXT_TYPE(inputstring, 'ascii'))
        except UnicodeDecodeError:
            pass
        # try to make string into unicode
        found_encoding = _guess_encoding(inputstring)
        unistring = _TEXT_TYPE(inputstring, found_encoding, 'replace')
    pieces = []
    for character in unistring:
        if character in _XLATE:
            pieces.append(_XLATE[character][0])
        elif ord(character) < 0x80:
            pieces.append(str(character))
        # any other character has no ASCII equivalent and is dropped
    return ''.join(pieces)

402

def put_through_pager(displaystring, pager='less'):
    """ Display displaystring in a pager program ('less' by default).
        The string is written to a temporary file with the suffix '.mail',
        and the shell command "<pager> <tempfile>" is run with os.system.
        The temporary file is removed afterwards, also if writing it
        fails.
        NOTE: pager is passed to the shell as it is, it must not contain
        untrusted input.
    """
    (temp_fd, tempname) = tempfile.mkstemp(".mail")
    try:
        temp_fh = os.fdopen(temp_fd, "w")
        try:
            temp_fh.write(displaystring)
        finally:
            # The file must be closed (flushed) before the pager reads it.
            temp_fh.close()
        os.system("%s %s" % (pager, tempname))
    finally:
        os.unlink(tempname)

411

def references_from_header(header):
    """ Extract the message ids from the "References" and "In-Reply-To"
        Headers.
        header is indexed with the header names; a missing header is
        expected to yield None, as email.message.Message objects do.
        Returns a list of the message ids including the angle brackets,
        without duplicates, in the order of their first appearance.
    """
    # Brackets are excluded inside an id, so that ids that follow each
    # other without whitespace ("<a@b><c@d>") are not merged into one.
    id_pattern = re.compile(r'<[^<>\s]+@[^<>\s]+>')
    result = []
    seen = set()
    for name in ('References', 'In-Reply-To'):
        value = header[name]
        if value is None:
            continue
        for message_id in id_pattern.findall(value):
            if message_id not in seen:
                seen.add(message_id)
                result.append(message_id)
    return result

427

Source Code for Module ProcImap.Utils.Processing