Commit c82bf464 authored by Romain Guillot

Add a w2c model parameter to preprocess so the model is not reloaded at each iteration

parent 23a28ab3
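In short: `preprocess` used to reload the Word2Vec model from disk on every call; it now takes the model as a parameter so callers can load it once and reuse it. A minimal sketch of the intended calling pattern (function names as in this diff; `texts` is a placeholder):

```python
from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import preprocess, load_word2vec

model = load_word2vec()  # load the saved Word2Vec model once at startup
texts = ["offre 1 ...", "offre 2 ..."]  # placeholder batch of offer texts
descriptors = [preprocess(text, model) for text in texts]  # reuse the same model for every call
```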
......@@ -15,7 +15,7 @@ parser = argparse.ArgumentParser()
parser.add_argument('-t', '--testing', action='store_true') # to use the testing database
parser.add_argument('-i', '--init', action='store_true') # to initialize the database
parser.add_argument('-r', '--reinit', action='store_true') # to reinitialize the database
parser.add_argument('-w', '--word2vec', action='store_true') # to train word2vec model
parser.add_argument('-w', '--w2c', action='store_true') # to train word2vec model
args = parser.parse_known_args()
# remove arguments to not interfere with unittest
......@@ -45,7 +45,7 @@ try:
except ValueError:
pass
try:
sys.argv.remove("--word2vec")
sys.argv.remove("--w2c")
except ValueError:
pass
......@@ -60,7 +60,7 @@ CORS(app)
app.config['TESTING'] = args[0].testing
app.config['INIT'] = args[0].init
app.config['REINIT'] = args[0].reinit
app.config['WORD2VEC'] = args[0].word2vec
app.config['WORD2VEC'] = args[0].w2c
from SmartRecruiting_BackEnd.data import DatabaseManager
dbManager = DatabaseManager()
......
......@@ -14,7 +14,7 @@ from SmartRecruiting_BackEnd.data.database import init_db, dbSession as dB
from SmartRecruiting_BackEnd import app
from SmartRecruiting_BackEnd.deeplearning.cnn.train import train, def_flags
from SmartRecruiting_BackEnd.deeplearning.cnn.eval import eval_all, save_eval, def_eval_flag
from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import init, reinit, preprocess
from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import init, reinit, preprocess, load_word2vec
from SmartRecruiting_BackEnd.deeplearning.preprocess.database_handler import descriptor_to_string
......@@ -170,7 +170,7 @@ class DatabaseManager:
return -1
def add_offer_link_field(self, title, content, id_user, id_field, inbase):
descriptor = preprocess(content)
descriptor = preprocess(content, load_word2vec())
descriptor = descriptor_to_string(descriptor)
id_offer = self.add_offer_v2(title, content, descriptor, id_user)
if id_offer != -1:
......
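Note that `add_offer_link_field` now calls `load_word2vec()` on every invocation, so adding offers one at a time still reloads the model each time. A possible follow-up, not part of this commit (the helper name `cached_word2vec` is hypothetical), would be to memoize the loaded model:

```python
from functools import lru_cache

from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import load_word2vec

@lru_cache(maxsize=1)
def cached_word2vec():
    """Hypothetical helper: load the Word2Vec model on first use, then serve it from cache."""
    return load_word2vec()

# add_offer_link_field could then use:
#     descriptor = preprocess(content, cached_word2vec())
```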
......@@ -40,7 +40,8 @@ def formation_by_offer(text):
# Eval Parameters
FLAGS = tf.flags.FLAGS
x_test = preprocess.preprocess(text)
model = preprocess.load_word2vec()
x_test = preprocess.preprocess(text, model)
# print(x_test)
checkpoint_file = check_path()
graph = tf.Graph()
......
......@@ -23,6 +23,76 @@ STOP_LIST = stop_words()
max_size = CONFIG['max_size_offers'] # The maximal size that a text should have.
def build_word2vec(offers=None):
"""
This function builds a Word2Vec neural network model, which produces a vector for each word in our vocabulary.
Word2Vec also builds our vocabulary from [sentences].
.. note::
Model parameters can be changed in the config file.
.. note::
We also add a padding word to the vocabulary so that texts can be padded with it.
:return: Word2Vec model trained on [sentences]
:rtype: Word2Vec
"""
print("Build word2vec model ...")
filename = CONFIG['offers_dataset']
sentences = []
if offers is None:
with open(filename, encoding='utf-8', mode="r") as f:
reader = csv.DictReader(f)
for i, row in enumerate(reader):
print('init offer ' + str(i + 1), end="\r", flush=True)
text = row[CONFIG['feature']] # Get the text from the initial offer
sentences += [tokenize(text)] # Sentences used to build the model's vocabulary
else:
for i, o in enumerate(offers):
print('reinit offer ' + str(i + 1), end="\r", flush=True)
text = o['content']
sentences = sentences + [tokenize(text)] # Sentences used to build the model's vocabulary
print() # Print a newline so later output does not overwrite the progress line
limit = CONFIG['word2vec_maxlinescorpus']
i = 0
with open("./data/wikipediaTXT.txt", encoding='iso-8859-1', mode='r') as f:
line = f.readline()
while line and i < limit:
cleaned = tokenize(line)
sentences += [cleaned]
line = f.readline()
i += 1
if i % 10000 == 0:
print("Still " + str(limit - i) + " lines to process.", flush=True, end="\r")
model = Word2Vec(sentences, size=CONFIG['word_vector_dim'], window=20, min_count=CONFIG['word2vec_minfreq'], workers=4)
model.wv[CONFIG['padding_str']] = np.array([0]*CONFIG['word_vector_dim']) # Add a descriptor for the padding word
model.save(CONFIG['word2vec_save'])
if CONFIG['debug']['visualisation_w2c'] == 1:
plot_words(model, max_words=CONFIG['debug']['max_word_to_plot'])
return model
def load_word2vec():
"""
:return:
:rtype:
"""
print("Load word2vec model ...")
try:
model = Word2Vec.load(CONFIG['word2vec_save'])
if CONFIG['debug']['visualisation_w2c'] == 1:
plot_words(model, max_words=CONFIG['debug']['max_word_to_plot'])
return model
except FileNotFoundError:
error(message="Unable to load word2vec model.\nConsider using --w2c to build the model.")
exit(1)
error(unexpected=True) # Unreachable: the try block either returns the model or exits
exit(1)
def tokenize(text):
"""
Function to tokenize the [text]. Following steps are done to tokenize the [text] :
......@@ -51,7 +121,7 @@ def tokenize(text):
return res
def preprocess(text):
def preprocess(text, model):
"""
Function to get a text descriptor from the [text]. Following steps are done :
1. Tokenize text (see tokenize function)
......@@ -61,11 +131,13 @@ def preprocess(text):
:param text: the input text
:type text: str
:param model: the preloaded Word2Vec model used for word lookups
:type model: Word2Vec
:return: word representations (each word is a float vector)
:rtype: list of numpy array
"""
cleaned = tokenize(text)
model = Word2Vec.load(CONFIG['word2vec_save'])
words = list(filter(lambda x: x in model.wv.vocab, cleaned)) # keep only words present in the model vocabulary
# Truncate or pad the text to normalize its length
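The normalization step itself is cut off by the next hunk; a minimal sketch of what truncate-or-pad logic of this kind usually looks like (an assumption, not the project's exact code; `max_size` is the module constant defined above and `padding_str` is the padding word whose vector is zeroed in `build_word2vec`):

```python
def normalize_length(words, max_size, padding_str):
    # Assumed shape of the step: truncate texts longer than max_size,
    # pad shorter ones with the padding word (zero vector in the model).
    if len(words) > max_size:
        return words[:max_size]
    return words + [padding_str] * (max_size - len(words))
```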
......@@ -88,13 +160,14 @@ def preprocess_all_and_add_to_database(db_manager, file_name):
:param db_manager: database manager
:type db_manager: DatabaseManager
:param file_name: path to the CSV file of offers
:type file_name: str
"""
model = load_word2vec()
with open(file_name, encoding='utf-8', mode="r") as f:
reader = csv.DictReader(f)
for i, row in enumerate(reader):
print('Preprocess offer ' + str(i + 1), end="\r", flush=True)
text = row[CONFIG['feature']] # Get the text from the initial offer
label = row[CONFIG['label']]
text_descriptor = preprocess(text)
text_descriptor = preprocess(text, model)
add_offer_to_database(db_manager, (text, text_descriptor, label))
......@@ -109,85 +182,17 @@ def recompute_all_descriptors(offers):
:return: a list of {'id': ..., 'desc': ...} dicts
:rtype: list of dict
"""
model = load_word2vec()
res = []
for i, offer in enumerate(offers):
print('recompute offer ' + str(i + 1))
desc = preprocess(offer['content'])
desc = preprocess(offer['content'], model)
_id = offer['id']
res += [{'id': _id,
'desc': desc}]
return res
def init(db_manager, generate_w2c=False):
"""
Function to initialize the model and the database
......
import yaml
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
class Colors:
HEADER = '\033[95m'
......
......@@ -9,7 +9,7 @@ sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
sys.argv.append('-t')
from SmartRecruiting_BackEnd import app
from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import tokenize, preprocess
from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import tokenize, preprocess, load_word2vec
from SmartRecruiting_BackEnd.deeplearning.preprocess.database_handler import descriptor_to_string
app.config['stop_list'] = ['is', 'this', 'a', 'the']
......@@ -26,8 +26,8 @@ class TestUser(unittest.TestCase):
self.assertEqual(tokenize(test_text), ['hello', 'test'])
def test_same_descriptor(self):
desc1 = preprocess(test_text_fr)
desc2 = preprocess(test_text_fr)
desc1 = preprocess(test_text_fr, load_word2vec())
desc2 = preprocess(test_text_fr, load_word2vec())
for i in range(0, len(desc1)):
self.assertListEqual(list(desc1[i]), list(desc2[i]))
......
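The updated test loads the model twice, once per `preprocess` call. A sketch of a variant that loads it once for the whole test class (standard `unittest` hooks; not part of this commit, and the fixture text is a placeholder):

```python
import unittest

from SmartRecruiting_BackEnd.deeplearning.preprocess.preprocess import preprocess, load_word2vec

test_text_fr = "Bonjour, ceci est une offre d'emploi."  # placeholder for the test fixture

class TestPreprocess(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = load_word2vec()  # load the Word2Vec model once for all tests in the class

    def test_same_descriptor(self):
        desc1 = preprocess(test_text_fr, self.model)
        desc2 = preprocess(test_text_fr, self.model)
        for d1, d2 in zip(desc1, desc2):
            self.assertListEqual(list(d1), list(d2))
```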