Commit e4efef24 authored by hatiers's avatar hatiers

alter table et db utf8 + import japonais

parent 4b3e434a
......@@ -9,3 +9,12 @@ magicword
- vérifier dans interface adminer, dans /variables, valeur token et stopword
- Pour import et suppression de lexiques -> le faire en commande car long
+# Commandes SQL à exécuter dans Adminer (conversion de la base et des tables du lexique en utf8mb4)
+
+ALTER DATABASE dbmw CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+ALTER TABLE lexicon_word CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+ALTER TABLE lexicon_word_start CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+ALTER TABLE lexicon_root CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+ALTER TABLE lexicon_feature CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
+ALTER TABLE lexicon_letter CONVERT TO CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_ci;
......@@ -69,7 +69,11 @@ doctrine:
dbname: "%database_name%"
user: "%database_user%"
password: "%database_password%"
charset: UTF8
charset: utf8mb4
default_table_options:
charset: utf8mb4
collate: utf8mb4_unicode_ci
logging: '%kernel.debug%'
profiling: '%kernel.debug%'
......
......@@ -152,6 +152,7 @@ class ImportManager
$handle = @fopen($pathFileLexicon, "r");
if ($handle) {
while (($line = fgets($handle, 4096)) !== false) {
$line = iconv("UTF-8", "UTF-8//IGNORE", $line);
if (preg_match_all("/^([^\t]+)\t([^\t]+)\t([^\t]+)\s*$/", $line, $matches)) {
$uid1 = uniqid();
$uid2 = uniqid();
......
# Converts a tab-separated lexicon into the TSV layout written to
# $fileLexiconOut.
#
# Input  : $fileLexiconIn  -- one entry per line: form <TAB> combo <TAB> features
# Output : $fileLexiconOut -- form <TAB> combo <TAB> "pos=" . features
#
# A line is accepted only when it consists of three tab-separated fields, each
# at least one character long.  Any other line is left out of the output file
# and reported on STDOUT instead.
use Encode;
use IO::Handle;
use strict;
use warnings;
use utf8;
use Data::Dumper;

# Diagnostics go to STDOUT unbuffered.  The lines echoed there are decoded
# text, so STDOUT needs an explicit encoding layer; without one Perl emits
# either Latin-1 bytes or raw UTF-8 depending on the characters in the line.
STDOUT->autoflush(1);
binmode(STDOUT, ':encoding(UTF-8)')
    or die "Cannot set UTF-8 encoding on STDOUT: $!\n";

################################## USER LEXICON VARIABLE ########################################
# NOTE(review): not referenced anywhere else in this script -- confirm whether
# it is still needed before removing it.
my $userLanguage = "frenchM2";

################################## RESOURCE PATHS ###############################################
my $fileLexiconIn  = "japonais.tsv";
my $fileLexiconOut = "lexicon-japan.tsv";

################################## READ LEXICON ENTRIES #########################################
# Open the input first: if it is missing we stop before creating or
# truncating the output file.
open(my $lexicon_in, '<:encoding(UTF-8)', $fileLexiconIn)
    or die "Cannot open '$fileLexiconIn' for reading: $!\n";
open(my $lexicon_out, '>:encoding(UTF-8)', $fileLexiconOut)
    or die "Cannot open '$fileLexiconOut' for writing: $!\n";

while (my $line = <$lexicon_in>) {
    # [^\t]+ also matches the line terminator, so the third capture keeps the
    # trailing newline of the input line; that is why nothing is appended
    # when the entry is printed below.
    if ($line =~ /^([^\t]+)\t([^\t]+)\t([^\t]+)$/) {
        my ($form, $combo, $feats) = ($1, $2, $3);
        print {$lexicon_out} $form . "\t" . $combo . "\tpos=" . $feats;
    }
    else {
        # Malformed line: report it and carry on with the next one.
        print "Ligne chelou $line";
    }
}

close($lexicon_in)
    or die "Error while closing '$fileLexiconIn': $!\n";
# Buffered write errors (disk full, ...) only surface when the handle is closed.
close($lexicon_out)
    or die "Error while closing '$fileLexiconOut': $!\n";
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment