Commit b8a23d22 authored by Arnaud Bey
Browse files

now read lexicon line by line

parent 702a56a1
...@@ -58,8 +58,8 @@ class ImportManager ...@@ -58,8 +58,8 @@ class ImportManager
// Gestion du lexique // Gestion du lexique
$pathFileLexicon = $pathLexiconDir.DIRECTORY_SEPARATOR."lexicon.tsv"; $pathFileLexicon = $pathLexiconDir.DIRECTORY_SEPARATOR."lexicon.tsv";
$linesLexicon = file($pathFileLexicon); //$linesLexicon = file($pathFileLexicon);
$this->parseTSVlexicon($linesLexicon, $specs); $this->parseTSVlexicon($pathFileLexicon, $specs);
return; return;
} }
...@@ -127,7 +127,7 @@ class ImportManager ...@@ -127,7 +127,7 @@ class ImportManager
return $specs; return $specs;
} }
public function parseTSVlexicon($linesLexicon, $specs) public function parseTSVlexicon($pathFileLexicon, $specs)
{ {
$this->em->getConnection()->getConfiguration()->setSQLLogger(null); $this->em->getConnection()->getConfiguration()->setSQLLogger(null);
...@@ -141,80 +141,85 @@ class ImportManager ...@@ -141,80 +141,85 @@ class ImportManager
$flushCpt = 0; $flushCpt = 0;
$cpt = 0; $cpt = 0;
$maxToFlush = 5000; $maxToFlush = 5000;
$total = count($linesLexicon); $total = count(file($pathFileLexicon));
$bigrams=[]; $bigrams=[];
$stopwatchName = uniqid(); $stopwatchName = uniqid();
$stopwatch = new Stopwatch(); $stopwatch = new Stopwatch();
$stopwatch->start($stopwatchName); $stopwatch->start($stopwatchName);
foreach ($linesLexicon as $line) { $handle = @fopen($pathFileLexicon, "r");
if ($flushCpt !== 0) { if ($handle) {
if (preg_match_all("/^([^\t]+)\t([^\t]+)\t([^\t]+)\s*$/", $line, $matches)) { while (($line = fgets($handle, 4096)) !== false) {
$wordValue = $matches[1][0]; if ($flushCpt !== 0) {
$rootValue = $matches[2][0]; if (preg_match_all("/^([^\t]+)\t([^\t]+)\t([^\t]+)\s*$/", $line, $matches)) {
#mb_eregi_replace ? $wordValue = $matches[1][0];
$cleanWordValue = str_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue); $rootValue = $matches[2][0];
// $cleanWordValue = preg_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue); #mb_eregi_replace ?
$cleanWordValue = str_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
$string2print="ROOT = ".$rootValue." / CLEAN = ".$cleanWordValue; // $cleanWordValue = preg_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
//$cleanWordValue = preg_replace("/\P{L}/", "", $cleanWordValue);#bug pour le russe
$cleanWordValue = mb_eregi_replace("/\P{L}/", "", $cleanWordValue);#fonctionne pour le russe $string2print="ROOT = ".$rootValue." / CLEAN = ".$cleanWordValue;
$string2print.=" / EREGI = ".$cleanWordValue."\n"; //$cleanWordValue = preg_replace("/\P{L}/", "", $cleanWordValue);#bug pour le russe
// Gestion de la root $cleanWordValue = mb_eregi_replace("/\P{L}/", "", $cleanWordValue);#fonctionne pour le russe
$root = $this->rm->findOrCreate($language, $rootValue, $roots); $string2print.=" / EREGI = ".$cleanWordValue."\n";
// Gestion de la root
// Gestion des features $root = $this->rm->findOrCreate($language, $rootValue, $roots);
$labelsNValues = explode(",", $matches[3][0]);
$features = []; // Gestion des features
foreach ($labelsNValues as $labelNValue) { $labelsNValues = explode(",", $matches[3][0]);
$featureStringTab = explode("=", $labelNValue); $features = [];
$features[] = $this->fm->findOrCreate($language, $featureStringTab[0], $featureStringTab[1]); foreach ($labelsNValues as $labelNValue) {
} $featureStringTab = explode("=", $labelNValue);
$this->wm->create($language, $root, $features, $wordValue, $cleanWordValue); $features[] = $this->fm->findOrCreate($language, $featureStringTab[0], $featureStringTab[1]);
// Gestion des lettres et débuts de mots
$wordsLetters = preg_split('//u', $cleanWordValue, null, PREG_SPLIT_NO_EMPTY);
$wordStartString = "";
$previousLetter="";
foreach ($wordsLetters as $wordLetter) {
$wordStartString .= $wordLetter;
if (!in_array($wordStartString, $wordStarts) && mb_strlen($wordStartString) > 1) {
$wordStarts[] = $wordStartString;
} }
if (!in_array($wordLetter, $letters)) { $this->wm->create($language, $root, $features, $wordValue, $cleanWordValue);
$letters[] = $wordLetter;
// Gestion des lettres et débuts de mots
$wordsLetters = preg_split('//u', $cleanWordValue, null, PREG_SPLIT_NO_EMPTY);
$wordStartString = "";
$previousLetter="";
foreach ($wordsLetters as $wordLetter) {
$wordStartString .= $wordLetter;
if (!in_array($wordStartString, $wordStarts) && mb_strlen($wordStartString) > 1) {
$wordStarts[] = $wordStartString;
}
if (!in_array($wordLetter, $letters)) {
$letters[] = $wordLetter;
}
//Gestion des $bigram
if ($previousLetter != "") {
$bigramString = $previousLetter.$wordLetter;
$bigrams[$bigramString] = !array_key_exists($bigramString, $bigrams) ? 1 : $bigrams[$bigramString] + 1;
}
$previousLetter = $wordLetter;
} }
//Gestion des $bigram if ($flushCpt == $maxToFlush) {
if ($previousLetter != "") { $this->wm->createStarts($language, $wordStarts);
$bigramString = $previousLetter.$wordLetter; $wordStarts = null;
$bigrams[$bigramString] = !array_key_exists($bigramString, $bigrams) ? 1 : $bigrams[$bigramString] + 1; $wordStarts = [];
$this->flushAndFreeMemory();
$flushCpt = 1;
$roots = null;
$roots = [];
$languageId = $specs["language_id"];
$language = $this->em->getRepository(Language::class)->find($languageId);
$percent = round($cpt / $total * 100, 2);
echo("[".$percent."%] ".$wordValue."\n");
$event = $stopwatch->stop($stopwatchName);
$stopwatchName = uniqid();
echo "max memory > " . $event->getMemory()/1048576 . " MB \n";
echo "duration > " . $event->getDuration()/1000 . " seconds \n\n";
$stopwatch->start($stopwatchName);
} }
$previousLetter = $wordLetter;
}
if ($flushCpt == $maxToFlush) {
$this->wm->createStarts($language, $wordStarts);
$wordStarts = null;
$wordStarts = [];
$this->flushAndFreeMemory();
$flushCpt = 1;
$roots = null;
$roots = [];
$languageId = $specs["language_id"];
$language = $this->em->getRepository(Language::class)->find($languageId);
$percent = round($cpt / $total * 100, 2);
echo("[".$percent."%] ".$wordValue."\n");
$event = $stopwatch->stop($stopwatchName);
$stopwatchName = uniqid();
echo "max memory > " . $event->getMemory()/1048576 . " MB \n";
echo "duration > " . $event->getDuration()/1000 . " seconds \n\n";
$stopwatch->start($stopwatchName);
} }
} }
$cpt++;
$flushCpt++;
} }
$cpt++; fclose($handle);
$flushCpt++;
} }
$this->bgm->generateBigrams($bigrams, $pathLexiconDir); $this->bgm->generateBigrams($bigrams, $pathLexiconDir);
echo("Bigram OK \n"); echo("Bigram OK \n");
$this->wm->createStarts($language, $wordStarts); $this->wm->createStarts($language, $wordStarts);
......
...@@ -49,8 +49,6 @@ class WordManager ...@@ -49,8 +49,6 @@ class WordManager
return $word; return $word;
} }
public function createStarts(Language $language, $wordStarts) public function createStarts(Language $language, $wordStarts)
{ {
$languageId = $language->getId(); $languageId = $language->getId();
...@@ -66,7 +64,6 @@ class WordManager ...@@ -66,7 +64,6 @@ class WordManager
return; return;
} }
public function recalculate(Word $word) public function recalculate(Word $word)
{ {
$language = $word->getLanguage(); $language = $word->getLanguage();
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment