Vous avez reçu un message "Your GitLab account has been locked ..." ? Pas d'inquiétude : lisez cet article https://docs.gricad-pages.univ-grenoble-alpes.fr/help/unlock/

Commit b8a23d22 authored by Arnaud Bey
Browse files

now read lexicon line by line

parent 702a56a1
......@@ -58,8 +58,8 @@ class ImportManager
// Gestion du lexique
$pathFileLexicon = $pathLexiconDir.DIRECTORY_SEPARATOR."lexicon.tsv";
$linesLexicon = file($pathFileLexicon);
$this->parseTSVlexicon($linesLexicon, $specs);
//$linesLexicon = file($pathFileLexicon);
$this->parseTSVlexicon($pathFileLexicon, $specs);
return;
}
......@@ -127,7 +127,7 @@ class ImportManager
return $specs;
}
public function parseTSVlexicon($linesLexicon, $specs)
public function parseTSVlexicon($pathFileLexicon, $specs)
{
$this->em->getConnection()->getConfiguration()->setSQLLogger(null);
......@@ -141,80 +141,85 @@ class ImportManager
$flushCpt = 0;
$cpt = 0;
$maxToFlush = 5000;
$total = count($linesLexicon);
$total = count(file($pathFileLexicon));
$bigrams=[];
$stopwatchName = uniqid();
$stopwatch = new Stopwatch();
$stopwatch->start($stopwatchName);
foreach ($linesLexicon as $line) {
if ($flushCpt !== 0) {
if (preg_match_all("/^([^\t]+)\t([^\t]+)\t([^\t]+)\s*$/", $line, $matches)) {
$wordValue = $matches[1][0];
$rootValue = $matches[2][0];
#mb_eregi_replace ?
$cleanWordValue = str_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
// $cleanWordValue = preg_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
$string2print="ROOT = ".$rootValue." / CLEAN = ".$cleanWordValue;
//$cleanWordValue = preg_replace("/\P{L}/", "", $cleanWordValue);#bug pour le russe
$cleanWordValue = mb_eregi_replace("/\P{L}/", "", $cleanWordValue);#fonctionne pour le russe
$string2print.=" / EREGI = ".$cleanWordValue."\n";
// Gestion de la root
$root = $this->rm->findOrCreate($language, $rootValue, $roots);
// Gestion des features
$labelsNValues = explode(",", $matches[3][0]);
$features = [];
foreach ($labelsNValues as $labelNValue) {
$featureStringTab = explode("=", $labelNValue);
$features[] = $this->fm->findOrCreate($language, $featureStringTab[0], $featureStringTab[1]);
}
$this->wm->create($language, $root, $features, $wordValue, $cleanWordValue);
// Gestion des lettres et débuts de mots
$wordsLetters = preg_split('//u', $cleanWordValue, null, PREG_SPLIT_NO_EMPTY);
$wordStartString = "";
$previousLetter="";
foreach ($wordsLetters as $wordLetter) {
$wordStartString .= $wordLetter;
if (!in_array($wordStartString, $wordStarts) && mb_strlen($wordStartString) > 1) {
$wordStarts[] = $wordStartString;
$handle = @fopen($pathFileLexicon, "r");
if ($handle) {
while (($line = fgets($handle, 4096)) !== false) {
if ($flushCpt !== 0) {
if (preg_match_all("/^([^\t]+)\t([^\t]+)\t([^\t]+)\s*$/", $line, $matches)) {
$wordValue = $matches[1][0];
$rootValue = $matches[2][0];
#mb_eregi_replace ?
$cleanWordValue = str_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
// $cleanWordValue = preg_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
$string2print="ROOT = ".$rootValue." / CLEAN = ".$cleanWordValue;
//$cleanWordValue = preg_replace("/\P{L}/", "", $cleanWordValue);#bug pour le russe
$cleanWordValue = mb_eregi_replace("/\P{L}/", "", $cleanWordValue);#fonctionne pour le russe
$string2print.=" / EREGI = ".$cleanWordValue."\n";
// Gestion de la root
$root = $this->rm->findOrCreate($language, $rootValue, $roots);
// Gestion des features
$labelsNValues = explode(",", $matches[3][0]);
$features = [];
foreach ($labelsNValues as $labelNValue) {
$featureStringTab = explode("=", $labelNValue);
$features[] = $this->fm->findOrCreate($language, $featureStringTab[0], $featureStringTab[1]);
}
if (!in_array($wordLetter, $letters)) {
$letters[] = $wordLetter;
$this->wm->create($language, $root, $features, $wordValue, $cleanWordValue);
// Gestion des lettres et débuts de mots
$wordsLetters = preg_split('//u', $cleanWordValue, null, PREG_SPLIT_NO_EMPTY);
$wordStartString = "";
$previousLetter="";
foreach ($wordsLetters as $wordLetter) {
$wordStartString .= $wordLetter;
if (!in_array($wordStartString, $wordStarts) && mb_strlen($wordStartString) > 1) {
$wordStarts[] = $wordStartString;
}
if (!in_array($wordLetter, $letters)) {
$letters[] = $wordLetter;
}
//Gestion des $bigram
if ($previousLetter != "") {
$bigramString = $previousLetter.$wordLetter;
$bigrams[$bigramString] = !array_key_exists($bigramString, $bigrams) ? 1 : $bigrams[$bigramString] + 1;
}
$previousLetter = $wordLetter;
}
//Gestion des $bigram
if ($previousLetter != "") {
$bigramString = $previousLetter.$wordLetter;
$bigrams[$bigramString] = !array_key_exists($bigramString, $bigrams) ? 1 : $bigrams[$bigramString] + 1;
if ($flushCpt == $maxToFlush) {
$this->wm->createStarts($language, $wordStarts);
$wordStarts = null;
$wordStarts = [];
$this->flushAndFreeMemory();
$flushCpt = 1;
$roots = null;
$roots = [];
$languageId = $specs["language_id"];
$language = $this->em->getRepository(Language::class)->find($languageId);
$percent = round($cpt / $total * 100, 2);
echo("[".$percent."%] ".$wordValue."\n");
$event = $stopwatch->stop($stopwatchName);
$stopwatchName = uniqid();
echo "max memory > " . $event->getMemory()/1048576 . " MB \n";
echo "duration > " . $event->getDuration()/1000 . " seconds \n\n";
$stopwatch->start($stopwatchName);
}
$previousLetter = $wordLetter;
}
if ($flushCpt == $maxToFlush) {
$this->wm->createStarts($language, $wordStarts);
$wordStarts = null;
$wordStarts = [];
$this->flushAndFreeMemory();
$flushCpt = 1;
$roots = null;
$roots = [];
$languageId = $specs["language_id"];
$language = $this->em->getRepository(Language::class)->find($languageId);
$percent = round($cpt / $total * 100, 2);
echo("[".$percent."%] ".$wordValue."\n");
$event = $stopwatch->stop($stopwatchName);
$stopwatchName = uniqid();
echo "max memory > " . $event->getMemory()/1048576 . " MB \n";
echo "duration > " . $event->getDuration()/1000 . " seconds \n\n";
$stopwatch->start($stopwatchName);
}
}
$cpt++;
$flushCpt++;
}
$cpt++;
$flushCpt++;
fclose($handle);
}
$this->bgm->generateBigrams($bigrams, $pathLexiconDir);
echo("Bigram OK \n");
$this->wm->createStarts($language, $wordStarts);
......
......@@ -49,8 +49,6 @@ class WordManager
return $word;
}
public function createStarts(Language $language, $wordStarts)
{
$languageId = $language->getId();
......@@ -66,7 +64,6 @@ class WordManager
return;
}
public function recalculate(Word $word)
{
$language = $word->getLanguage();
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment