Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Mathieu Loiseau
MagicWord
Commits
b8a23d22
Commit
b8a23d22
authored
May 21, 2019
by
Arnaud Bey
Browse files
now read lexicon line by line
parent
702a56a1
Changes
2
Hide whitespace changes
Inline
Side-by-side
application/src/LexiconBundle/Manager/ImportManager.php
View file @
b8a23d22
...
@@ -58,8 +58,8 @@ class ImportManager
...
@@ -58,8 +58,8 @@ class ImportManager
// Gestion du lexique
// Gestion du lexique
$pathFileLexicon
=
$pathLexiconDir
.
DIRECTORY_SEPARATOR
.
"lexicon.tsv"
;
$pathFileLexicon
=
$pathLexiconDir
.
DIRECTORY_SEPARATOR
.
"lexicon.tsv"
;
$linesLexicon
=
file
(
$pathFileLexicon
);
//
$linesLexicon = file($pathFileLexicon);
$this
->
parseTSVlexicon
(
$
lines
Lexicon
,
$specs
);
$this
->
parseTSVlexicon
(
$
pathFile
Lexicon
,
$specs
);
return
;
return
;
}
}
...
@@ -127,7 +127,7 @@ class ImportManager
...
@@ -127,7 +127,7 @@ class ImportManager
return
$specs
;
return
$specs
;
}
}
public
function
parseTSVlexicon
(
$
lines
Lexicon
,
$specs
)
public
function
parseTSVlexicon
(
$
pathFile
Lexicon
,
$specs
)
{
{
$this
->
em
->
getConnection
()
->
getConfiguration
()
->
setSQLLogger
(
null
);
$this
->
em
->
getConnection
()
->
getConfiguration
()
->
setSQLLogger
(
null
);
...
@@ -141,80 +141,85 @@ class ImportManager
...
@@ -141,80 +141,85 @@ class ImportManager
$flushCpt
=
0
;
$flushCpt
=
0
;
$cpt
=
0
;
$cpt
=
0
;
$maxToFlush
=
5000
;
$maxToFlush
=
5000
;
$total
=
count
(
$lines
Lexicon
);
$total
=
count
(
file
(
$pathFile
Lexicon
)
)
;
$bigrams
=
[];
$bigrams
=
[];
$stopwatchName
=
uniqid
();
$stopwatchName
=
uniqid
();
$stopwatch
=
new
Stopwatch
();
$stopwatch
=
new
Stopwatch
();
$stopwatch
->
start
(
$stopwatchName
);
$stopwatch
->
start
(
$stopwatchName
);
foreach
(
$linesLexicon
as
$line
)
{
$handle
=
@
fopen
(
$pathFileLexicon
,
"r"
);
if
(
$flushCpt
!==
0
)
{
if
(
$handle
)
{
if
(
preg_match_all
(
"/^([^
\t
]+)
\t
([^
\t
]+)
\t
([^
\t
]+)\s*$/"
,
$line
,
$matches
))
{
while
((
$line
=
fgets
(
$handle
,
4096
))
!==
false
)
{
$wordValue
=
$matches
[
1
][
0
];
if
(
$flushCpt
!==
0
)
{
$rootValue
=
$matches
[
2
][
0
];
if
(
preg_match_all
(
"/^([^
\t
]+)
\t
([^
\t
]+)
\t
([^
\t
]+)\s*$/"
,
$line
,
$matches
))
{
#mb_eregi_replace ?
$wordValue
=
$matches
[
1
][
0
];
$cleanWordValue
=
str_replace
(
$specs
[
"rewriteFrom"
],
$specs
[
"rewriteTo"
],
$wordValue
);
$rootValue
=
$matches
[
2
][
0
];
// $cleanWordValue = preg_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
#mb_eregi_replace ?
$cleanWordValue
=
str_replace
(
$specs
[
"rewriteFrom"
],
$specs
[
"rewriteTo"
],
$wordValue
);
$string2print
=
"ROOT = "
.
$rootValue
.
" / CLEAN = "
.
$cleanWordValue
;
// $cleanWordValue = preg_replace($specs["rewriteFrom"], $specs["rewriteTo"], $wordValue);
//$cleanWordValue = preg_replace("/\P{L}/", "", $cleanWordValue);#bug pour le russe
$cleanWordValue
=
mb_eregi_replace
(
"/\P
{
L
}
/"
,
""
,
$cleanWordValue
);
#fonctionne pour le russe
$string2print
=
"ROOT = "
.
$rootValue
.
" / CLEAN = "
.
$cleanWordValue
;
$string2print
.
=
" / EREGI = "
.
$cleanWordValue
.
"
\n
"
;
//$cleanWordValue = preg_replace("/\P{L}/", "", $cleanWordValue);#bug pour le russe
// Gestion de la root
$cleanWordValue
=
mb_eregi_replace
(
"/\P
{
L
}
/"
,
""
,
$cleanWordValue
);
#fonctionne pour le russe
$root
=
$this
->
rm
->
findOrCreate
(
$language
,
$rootValue
,
$roots
);
$string2print
.
=
" / EREGI = "
.
$cleanWordValue
.
"
\n
"
;
// Gestion de la root
// Gestion des features
$root
=
$this
->
rm
->
findOrCreate
(
$language
,
$rootValue
,
$roots
);
$labelsNValues
=
explode
(
","
,
$matches
[
3
][
0
]);
$features
=
[];
// Gestion des features
foreach
(
$labelsNValues
as
$labelNValue
)
{
$labelsNValues
=
explode
(
","
,
$matches
[
3
][
0
]);
$featureStringTab
=
explode
(
"="
,
$labelNValue
);
$features
=
[];
$features
[]
=
$this
->
fm
->
findOrCreate
(
$language
,
$featureStringTab
[
0
],
$featureStringTab
[
1
]);
foreach
(
$labelsNValues
as
$labelNValue
)
{
}
$featureStringTab
=
explode
(
"="
,
$labelNValue
);
$this
->
wm
->
create
(
$language
,
$root
,
$features
,
$wordValue
,
$cleanWordValue
);
$features
[]
=
$this
->
fm
->
findOrCreate
(
$language
,
$featureStringTab
[
0
],
$featureStringTab
[
1
]);
// Gestion des lettres et débuts de mots
$wordsLetters
=
preg_split
(
'//u'
,
$cleanWordValue
,
null
,
PREG_SPLIT_NO_EMPTY
);
$wordStartString
=
""
;
$previousLetter
=
""
;
foreach
(
$wordsLetters
as
$wordLetter
)
{
$wordStartString
.
=
$wordLetter
;
if
(
!
in_array
(
$wordStartString
,
$wordStarts
)
&&
mb_strlen
(
$wordStartString
)
>
1
)
{
$wordStarts
[]
=
$wordStartString
;
}
}
if
(
!
in_array
(
$wordLetter
,
$letters
))
{
$this
->
wm
->
create
(
$language
,
$root
,
$features
,
$wordValue
,
$cleanWordValue
);
$letters
[]
=
$wordLetter
;
// Gestion des lettres et débuts de mots
$wordsLetters
=
preg_split
(
'//u'
,
$cleanWordValue
,
null
,
PREG_SPLIT_NO_EMPTY
);
$wordStartString
=
""
;
$previousLetter
=
""
;
foreach
(
$wordsLetters
as
$wordLetter
)
{
$wordStartString
.
=
$wordLetter
;
if
(
!
in_array
(
$wordStartString
,
$wordStarts
)
&&
mb_strlen
(
$wordStartString
)
>
1
)
{
$wordStarts
[]
=
$wordStartString
;
}
if
(
!
in_array
(
$wordLetter
,
$letters
))
{
$letters
[]
=
$wordLetter
;
}
//Gestion des $bigram
if
(
$previousLetter
!=
""
)
{
$bigramString
=
$previousLetter
.
$wordLetter
;
$bigrams
[
$bigramString
]
=
!
array_key_exists
(
$bigramString
,
$bigrams
)
?
1
:
$bigrams
[
$bigramString
]
+
1
;
}
$previousLetter
=
$wordLetter
;
}
}
//Gestion des $bigram
if
(
$flushCpt
==
$maxToFlush
)
{
if
(
$previousLetter
!=
""
)
{
$this
->
wm
->
createStarts
(
$language
,
$wordStarts
);
$bigramString
=
$previousLetter
.
$wordLetter
;
$wordStarts
=
null
;
$bigrams
[
$bigramString
]
=
!
array_key_exists
(
$bigramString
,
$bigrams
)
?
1
:
$bigrams
[
$bigramString
]
+
1
;
$wordStarts
=
[];
$this
->
flushAndFreeMemory
();
$flushCpt
=
1
;
$roots
=
null
;
$roots
=
[];
$languageId
=
$specs
[
"language_id"
];
$language
=
$this
->
em
->
getRepository
(
Language
::
class
)
->
find
(
$languageId
);
$percent
=
round
(
$cpt
/
$total
*
100
,
2
);
echo
(
"["
.
$percent
.
"%] "
.
$wordValue
.
"
\n
"
);
$event
=
$stopwatch
->
stop
(
$stopwatchName
);
$stopwatchName
=
uniqid
();
echo
"max memory > "
.
$event
->
getMemory
()
/
1048576
.
" MB
\n
"
;
echo
"duration > "
.
$event
->
getDuration
()
/
1000
.
" seconds
\n\n
"
;
$stopwatch
->
start
(
$stopwatchName
);
}
}
$previousLetter
=
$wordLetter
;
}
if
(
$flushCpt
==
$maxToFlush
)
{
$this
->
wm
->
createStarts
(
$language
,
$wordStarts
);
$wordStarts
=
null
;
$wordStarts
=
[];
$this
->
flushAndFreeMemory
();
$flushCpt
=
1
;
$roots
=
null
;
$roots
=
[];
$languageId
=
$specs
[
"language_id"
];
$language
=
$this
->
em
->
getRepository
(
Language
::
class
)
->
find
(
$languageId
);
$percent
=
round
(
$cpt
/
$total
*
100
,
2
);
echo
(
"["
.
$percent
.
"%] "
.
$wordValue
.
"
\n
"
);
$event
=
$stopwatch
->
stop
(
$stopwatchName
);
$stopwatchName
=
uniqid
();
echo
"max memory > "
.
$event
->
getMemory
()
/
1048576
.
" MB
\n
"
;
echo
"duration > "
.
$event
->
getDuration
()
/
1000
.
" seconds
\n\n
"
;
$stopwatch
->
start
(
$stopwatchName
);
}
}
}
}
$cpt
++
;
$flushCpt
++
;
}
}
$cpt
++
;
fclose
(
$handle
);
$flushCpt
++
;
}
}
$this
->
bgm
->
generateBigrams
(
$bigrams
,
$pathLexiconDir
);
$this
->
bgm
->
generateBigrams
(
$bigrams
,
$pathLexiconDir
);
echo
(
"Bigram OK
\n
"
);
echo
(
"Bigram OK
\n
"
);
$this
->
wm
->
createStarts
(
$language
,
$wordStarts
);
$this
->
wm
->
createStarts
(
$language
,
$wordStarts
);
...
...
application/src/LexiconBundle/Manager/WordManager.php
View file @
b8a23d22
...
@@ -49,8 +49,6 @@ class WordManager
...
@@ -49,8 +49,6 @@ class WordManager
return
$word
;
return
$word
;
}
}
public
function
createStarts
(
Language
$language
,
$wordStarts
)
public
function
createStarts
(
Language
$language
,
$wordStarts
)
{
{
$languageId
=
$language
->
getId
();
$languageId
=
$language
->
getId
();
...
@@ -66,7 +64,6 @@ class WordManager
...
@@ -66,7 +64,6 @@ class WordManager
return
;
return
;
}
}
public
function
recalculate
(
Word
$word
)
public
function
recalculate
(
Word
$word
)
{
{
$language
=
$word
->
getLanguage
();
$language
=
$word
->
getLanguage
();
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment