Skip to content

Email Processor

create_enron_dictionary(root_dir='spampy/datasets/enron')

A function which creates a dictionary from the Enron dataset. Uses multiple processes.

Parameters:

Name Type Description Default
root_dir str

Root folders for enron dataset.

'spampy/datasets/enron'
Source code in spampy/email_processor.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def create_enron_dictionary(root_dir: str = "spampy/datasets/enron") -> Dict:
    """
    Build a word-frequency dictionary from the Enron dataset.

    Spawns one worker process per top-level folder under ``root_dir``;
    each worker tallies word counts into a manager-shared dict (see
    ``enron_processor``).  Non-alphabetic and single-character tokens
    are dropped, the 3000 most common words are kept, and the result
    is persisted to ``dict_enron.npy`` as a side effect.

    Args:
      root_dir (str):
        Root folder for the Enron dataset.

    Returns:
      List of (word, count) pairs for the 3000 most common words.
    """

    manager = mp.Manager()
    return_dict = manager.dict()
    jobs = []
    emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
    for emails_dir in emails_dirs:
        p = mp.Process(target=enron_processor, args=(emails_dir, return_dict))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()

    dictionary = return_dict["all_words"]
    list_to_remove = return_dict["list_to_remove"]

    # Drop noise tokens.  Iterate the snapshot key list (not the Counter
    # itself) so deleting entries during the loop is safe.
    for item in list_to_remove:
        if not item.isalpha() or len(item) == 1:
            del dictionary[item]
    dictionary = dictionary.most_common(3000)
    np.save("dict_enron.npy", dictionary)
    return dictionary

create_tokenlist(email)

Tokenizes it, creates a list of tokens in the e-mail.

Parameters:

Name Type Description Default
email str

Raw e-mail

required

Returns:

Type Description
List

Ordered list of tokens in the e-mail.

Source code in spampy/email_processor.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
def create_tokenlist(email: str) -> List:
    """
    Tokenize a raw e-mail into an ordered list of stemmed word tokens.

    Args:
      email (str):
        Raw e-mail.
    Returns:
      Ordered list of stemmed tokens in the e-mail.
    """

    # Use the NLTK Porter stemmer to normalise word forms.
    stemmer = nltk.stem.porter.PorterStemmer()
    email = preprocess(email)
    # Split the e-mail into single words by ' ', '@', '$', '/', ...
    tokens = re.split(
        r"[ \@\$\/\#\.\-\:\&\*\+\=\[\]\?\!\(\)\{\}\,\'\"\>\_\<\;\%]", email
    )
    tokenlist = []
    for token in tokens:
        # Remove any non-alphanumeric characters.
        token = re.sub("[^a-zA-Z0-9]", "", token)
        # Skip empty tokens BEFORE paying the cost of stemming them
        # (the original stemmed every token and discarded empties after).
        if not token:
            continue
        # Use the Porter stemmer to shorten the word and keep the stem.
        tokenlist.append(stemmer.stem(token))
    return tokenlist

enron_processor(emails_dir, return_dict)

A function which processes .txt email files into lists and returns them in a dictionary.

Parameters:

Name Type Description Default
emails_dir str

Root folders for emails.

required
return_dict Dict

Shared dict for processed data.

required
Source code in spampy/email_processor.py
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
def enron_processor(emails_dir: str, return_dict: Dict) -> Dict:
    """
    Read every e-mail file under ``emails_dir`` and publish the word
    counts into the shared ``return_dict``.

    Args:
      emails_dir (str):
        Root folder for e-mails.
      return_dict (dict):
        Shared dict for processed data; receives "all_words"
        (a Counter) and "list_to_remove" (its key list).
    """

    words = []
    for subdir in (os.path.join(emails_dir, name) for name in listdir(emails_dir)):
        for mail_path in (os.path.join(subdir, name) for name in listdir(subdir)):
            # Ignore undecodable bytes rather than aborting on one bad file.
            with codecs.open(mail_path, "r", encoding="utf-8", errors="ignore") as handle:
                for line in handle:
                    words.extend(line.split())
    counts = Counter(words)
    return_dict["all_words"] = counts
    return_dict["list_to_remove"] = list(counts.keys())

extract_enron_features(root_dir='spampy/datasets/enron')

A function that creates features and labels from the Enron dataset. Uses multiple processes and returns them in a tuple.

Parameters:

Name Type Description Default
root_dir str

Root folders for enron dataset.

'spampy/datasets/enron'
Source code in spampy/email_processor.py
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
def extract_enron_features(root_dir: str = "spampy/datasets/enron") -> Tuple:
    """
    Create features and labels from the Enron dataset.

    Builds the vocabulary, then spawns one worker process per dataset
    folder to fill a shared feature matrix and label vector (see
    ``features_processor``).

    NOTE(review): each worker copies the matrix out of the manager
    dict, mutates it locally and writes it back, so concurrent writers
    can clobber each other's rows — confirm the folders map to
    disjoint docID ranges.

    Args:
      root_dir (str):
        Root folder for the Enron dataset.

    Returns:
      Tuple of (features_matrix, train_labels) numpy arrays.
    """

    enron_dict = create_enron_dictionary(root_dir)
    manager = mp.Manager()
    return_dict = manager.dict()
    return_dict["enron_dict"] = enron_dict
    # 33716 rows — presumably the total e-mail count of the dataset
    # (TODO confirm); 3000 columns matches the vocabulary size.
    features_matrix = np.zeros((33716, 3000))
    train_labels = np.zeros(33716)
    return_dict["features_matrix"] = features_matrix
    return_dict["train_labels"] = train_labels
    jobs = []
    # Use the module-level listdir() (not os.listdir) so hidden files
    # like .DS_Store are skipped, matching create_enron_dictionary.
    emails_dirs = [os.path.join(root_dir, f) for f in listdir(root_dir)]
    for emails_dir in emails_dirs:
        p = mp.Process(target=features_processor, args=(emails_dir, return_dict))
        jobs.append(p)
        p.start()

    for proc in jobs:
        proc.join()

    features_matrix = return_dict["features_matrix"]
    train_labels = return_dict["train_labels"]
    return np.array(features_matrix), np.array(train_labels)

feature_vector_from_email(email, vocablary_dict)

Returns a vector of shape (n,1) with a size of the vocablary_dict. If the vocab word with index == 1 is in the email, first element in this vector is 1, 0 otherwise.

Parameters:

Name Type Description Default
email str

E-mail.

required
vocablary_dict Dict

Vocablary dictionary created by get_vocablary_dict.

required
Source code in spampy/email_processor.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
def feature_vector_from_email(email: str, vocablary_dict: Dict) -> np.ndarray:
    """
    Returns a vector of shape (n,1) with a size of the vocablary_dict.
    If the vocab word with index == 1 is in the email, first element in
    this vector is 1, 0 otherwise.
    Args:
      email (str):
        E-mail.
      vocablary_dict (dict):
        Vocablary dictionary created by `get_vocablary_dict`.
    Returns:
      numpy array of shape (n, 1): binary indicator per vocabulary entry.
    """

    n = len(vocablary_dict)
    # One slot per vocabulary entry, all zero until a token matches.
    result = np.zeros((n, 1))
    # NOTE(review): this assignment only marks the intended rows if
    # get_vocablary_indices yields integer indices — verify the
    # orientation of the vocabulary dict (word -> index).
    vocablary_indices = get_vocablary_indices(email, vocablary_dict)
    for index in vocablary_indices:
        result[index] = 1
    return result

features_processor(emails_dir, return_dict)

A function which processes data features into lists and returns in a dictionary.

Parameters:

Name Type Description Default
emails_dir str

Root folders for emails.

required
return_dict Dict

Shared dict for processed data.

required
Source code in spampy/email_processor.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def features_processor(emails_dir: str, return_dict: Dict) -> Dict:
    """
    Fill the shared feature matrix and label vector from the e-mails
    under ``emails_dir``.

    Each e-mail occupies one row (docID).  For every word of the
    e-mail that appears in the shared vocabulary, the word's total
    count in that e-mail is written into the corresponding column.
    The label is 1 when the second-to-last dot-separated part of the
    file name is "spam", 0 otherwise.

    Args:
      emails_dir (str):
        Root folders for emails.
      return_dict (dict):
        Shared dict for processed data; must contain "features_matrix",
        "train_labels" and "enron_dict" (list of (word, count) pairs).
    """

    features_matrix = return_dict["features_matrix"]
    train_labels = return_dict["train_labels"]
    enron_dict = return_dict["enron_dict"]
    # Precompute word -> column index once: O(1) lookups instead of
    # scanning the whole vocabulary for every word occurrence.
    word_index = {entry[0]: i for i, entry in enumerate(enron_dict)}
    docID = 0
    dirs = [os.path.join(emails_dir, f) for f in os.listdir(emails_dir)]
    for d in dirs:
        emails = [os.path.join(d, f) for f in os.listdir(d)]
        for mail in emails:
            with open(mail) as m:
                all_words = []
                for line in m:
                    all_words += line.split()
                for word in all_words:
                    # BUG FIX: the original compared vocabulary entries
                    # against the literal string u"word" instead of the
                    # variable `word` (and shadowed the directory loop
                    # variable `d`), so features were never recorded.
                    wordID = word_index.get(word)
                    if wordID is not None:
                        features_matrix[docID, wordID] = all_words.count(word)
            train_labels[docID] = int(mail.split(".")[-2] == "spam")
            docID = docID + 1
    return_dict["features_matrix"] = features_matrix
    return_dict["train_labels"] = train_labels

get_vocablary_dict(path='spampy/datasets', filename='vocablary.txt')

Add vocablary text file content into a dictionary.

Parameters:

Name Type Description Default
path str

Vocablary file folder path.

'spampy/datasets'
filename str

Vocablary file name.

'vocablary.txt'

Returns:

Type Description
Dict

Vocablary dict.

Source code in spampy/email_processor.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
def get_vocablary_dict(
    path: str = "spampy/datasets", filename: str = "vocablary.txt"
) -> Dict:
    """
    Load the vocabulary text file into a dictionary mapping each word
    to its integer index.  Each line of the file is "<index> <word>".

    BUG FIX: the original stored {index: word}, but
    ``get_vocablary_indices`` looks tokens (words) up as KEYS and
    expects an integer index back, so no token ever matched and every
    feature vector came out all-zero.  The mapping is now
    {word: index}, which also keeps ``len()`` (used by
    ``feature_vector_from_email``) unchanged.

    Args:
      path (str):
        Vocabulary file folder path.
      filename (str):
        Vocabulary file name.
    Returns:
      Dict mapping word -> integer index.
    """

    vocablary_dict = {}
    with open(os.path.join(path, filename), "r") as f:
        for line in f:
            (val, key) = line.split()
            vocablary_dict[key] = int(val)
    return vocablary_dict

get_vocablary_indices(email, vocablary_dict)

Returns a list of indices (location) of each stemmed word in email.

Parameters:

Name Type Description Default
email str

E-mail.

required
vocablary_dict Dict

Vocablary dictionary created by get_vocablary_dict.

required

Returns:

Type Description
List

Indices list.

Source code in spampy/email_processor.py
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def get_vocablary_indices(email: str, vocablary_dict: Dict) -> List:
    """
    Map every stemmed token of ``email`` that is a key of
    ``vocablary_dict`` to its dictionary value, preserving token order.

    Args:
      email (str):
        E-mail.
      vocablary_dict (dict):
        Vocabulary dictionary created by `get_vocablary_dict`; expected
        to map word -> index for the lookups below to match.
    Returns:
      Indices list.
    """

    indices = []
    for token in create_tokenlist(email):
        if token in vocablary_dict:
            indices.append(vocablary_dict[token])
    return indices

listdir(directory)

A specialized version of os.listdir() that ignores files that start with a leading period.

Especially dismissing .DS_STORE s.

Source code in spampy/email_processor.py
133
134
135
136
137
138
139
140
141
def listdir(directory: str) -> List:
    """
    Like os.listdir(), but drop entries whose names begin with a
    period — notably .DS_Store.
    """
    return [entry for entry in os.listdir(directory) if not entry.startswith(".")]

preprocess(email)

Preprocess (simplifies) raw email.

Parameters:

Name Type Description Default
email str

Raw e-mail

required

Returns:

Type Description
str

Processed (simplified) email

Source code in spampy/email_processor.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def preprocess(email: str) -> str:
    """
    Simplify a raw e-mail: lower-case it, strip HTML tags, and collapse
    numbers, URLs, e-mail addresses and dollar signs into fixed
    placeholder tokens.

    Args:
      email (str):
        Raw e-mail.
    Returns:
      Processed (simplified) e-mail.
    """

    simplified = email.lower()
    # Substitutions run in order; numbers are collapsed before URL and
    # address detection, exactly as in the original pipeline.
    substitutions = (
        ("<[^<>]+>", " "),                       # strip html tags
        ("[0-9]+", "number"),                    # digit runs -> 'number'
        (r"(http|https)://[^\s]*", "httpaddr"),  # URLs -> 'httpaddr'
        (r"[^\s]+@[^\s]+", "emailaddr"),         # x@y tokens -> 'emailaddr'
        ("[$]+", "dollar"),                      # '$' runs -> 'dollar'
    )
    for pattern, replacement in substitutions:
        simplified = re.sub(pattern, replacement, simplified)
    return simplified