Models

Predefined topic labelling models, located at atnlp/share/models

Logistic regression (logreg)

atnlp/share/models/logreg.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# encoding: utf-8
"""
logreg.py
~~~~~~~~~

Pipeline converting raw text to sparse tfidf representation and feeding to
logistic regression for topic labelling.

"""
__author__ = "Will Davey"
__email__ = "wedavey@gmail.com"
__created__ = "2018-06-12"
__copyright__ = "Copyright 2018 Will Davey"
__license__ = "MIT https://opensource.org/licenses/MIT"

# imports
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# model: raw text -> tf-idf features -> one-vs-rest logistic regression
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('logreg', OneVsRestClassifier(estimator=LogisticRegression())),
    ],
)

# EOF

Support Vector Machine (svm)

atnlp/share/models/svm.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
# encoding: utf-8
"""
svm.py
~~~~~~

Pipeline converting raw text to sparse tfidf representation and feeding to
a support vector machine for topic labelling.

"""
__author__ = "Will Davey"
__email__ = "wedavey@gmail.com"
__created__ = "2018-06-07"
__copyright__ = "Copyright 2018 Will Davey"
__license__ = "MIT https://opensource.org/licenses/MIT"

# imports
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# model: raw text -> tf-idf features -> one-vs-rest support vector classifier
model = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('svc', OneVsRestClassifier(estimator=SVC())),
    ],
)

# EOF

Word Match (wordmatch)

atnlp/share/models/wordmatch.py
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# encoding: utf-8
"""
wordmatch.py
~~~~~~~~~~~~

Pipeline converting raw text to sparse bag-of-words representation and feeding to
a custom keyword-based topic labelling model.

"""
__author__ = "Will Davey"
__email__ = "wedavey@gmail.com"
__created__ = "2018-06-07"
__copyright__ = "Copyright 2018 Will Davey"
__license__ = "MIT https://opensource.org/licenses/MIT"

# imports
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.multiclass import OneVsRestClassifier
from atnlp.model.wordmatch import WordMatchClassifier

# model: raw text -> bag-of-words counts -> one-vs-rest word-match classifier
model = Pipeline(
    steps=[
        ('bow', CountVectorizer()),
        ('wmc', OneVsRestClassifier(estimator=WordMatchClassifier())),
    ],
)

# EOF