Skip to content

Spam Classifier

classify_email(email)

Classify whether the given email is spam.

Parameters:

Name Type Description Default
email str

Raw e-mail.

required

Returns:

Type Description
int

Spam or not.

Source code in spampy/spam_classifier.py
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
def classify_email(email: str) -> int:
    """
    Predict whether a raw e-mail is spam with the linear SVM.

    ``train_svm()`` is invoked on every call, so the model is fitted
    again before each prediction.

    Args:
      email (str):
        Raw e-mail.
    Returns:
      The result of ``linear_svm.predict`` for the e-mail.
    """

    # Fit the model first; prediction below relies on a fitted estimator.
    train_svm()

    vocabulary = email_processor.get_vocablary_dict()
    features = email_processor.feature_vector_from_email(email, vocabulary)

    # predict() is given a 2-D input: rows of 1899 columns each.
    # NOTE(review): 1899 presumably equals the vocabulary size -- confirm.
    sample = np.reshape(features, (-1, 1899))
    return linear_svm.predict(sample)

classify_email_with_enron(email)

Classify whether the given email is spam, using a model trained on the Enron dataset.

Parameters:

Name Type Description Default
email str

Raw e-mail.

required

Returns:

Type Description
int

Spam or not.

Source code in spampy/spam_classifier.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def classify_email_with_enron(email: str) -> int:
    """
    Classify spam possibility of given email with enron dataset.

    The Enron feature matrix and labels are cached in the current working
    directory as ``enron_features_matrix.npy`` and ``enron_labels.npy``.
    They are loaded from disk when both files exist; otherwise they are
    extracted again and both files are (re)written.

    Args:
      email (str):
        Raw e-mail.
    Returns:
      The result of ``linear_svc.predict`` for the e-mail.
    """

    vocablary_dict = email_processor.create_enron_dictionary()
    feature_vector = email_processor.feature_vector_from_email(email, vocablary_dict)
    # predict() is given a 2-D input: rows of 3000 columns each.
    double_dimesion_email = np.reshape(feature_vector, (-1, 3000))

    features_path = "enron_features_matrix.npy"
    labels_path = "enron_labels.npy"

    # The previous check (`exists(a) == False & exists(b) == False`) parsed as
    # a chained comparison that only tested the first file, because `&` binds
    # tighter than `==`. A missing labels file therefore crashed in np.load.
    # Use the cache only when BOTH files are present; rebuild it otherwise.
    if os.path.exists(features_path) and os.path.exists(labels_path):
        features_matrix = np.load(features_path)
        labels = np.load(labels_path)
    else:
        features_matrix, labels = email_processor.extract_enron_features()
        np.save(features_path, features_matrix)
        np.save(labels_path, labels)

    # Only the training portion of the split is used; the held-out 40% is discarded.
    X_train, _, y_train, _ = train_test_split(features_matrix, labels, test_size=0.40)
    linear_svc.fit(X_train, y_train)
    return linear_svc.predict(double_dimesion_email)

load_test_set()

Load test set and return features and labels.

Returns:

Type Description
Tuple[List, List]

Test features and labels.

Source code in spampy/spam_classifier.py
37
38
39
40
41
42
43
44
45
46
47
def load_test_set() -> Tuple[List, List]:
    """
    Read the test split from ``datasets/spamTest.mat``.

    Returns:
      The ``Xtest`` and ``ytest`` entries of the loaded file, in that order.
    """

    mat_contents = sio.loadmat(join(parent_directory_path, "datasets/spamTest.mat"))
    return mat_contents["Xtest"], mat_contents["ytest"]

load_training_set()

Load training set and return features and labels.

Returns:

Type Description
Tuple[List, List]

Training features and labels.

Source code in spampy/spam_classifier.py
23
24
25
26
27
28
29
30
31
32
33
34
def load_training_set() -> Tuple[List, List]:
    """
    Read the training split from ``datasets/spamTrain.mat``.

    Returns:
      The ``X`` and ``y`` entries of the loaded file, in that order.
    """

    mat_contents = sio.loadmat(join(parent_directory_path, "datasets/spamTrain.mat"))
    return mat_contents["X"], mat_contents["y"]

train_svm()

Fit SVM with features and labels.

Source code in spampy/spam_classifier.py
50
51
52
53
54
55
56
def train_svm():
    """
    Fit ``linear_svm`` on the data returned by ``load_training_set()``.
    """

    features, labels = load_training_set()
    # The labels are flattened to one dimension before fitting.
    flat_labels = labels.flatten()
    linear_svm.fit(features, flat_labels)