Vous avez reçu un message "Your GitLab account has been locked ..." ? Pas d'inquiétude : lisez cet article https://docs.gricad-pages.univ-grenoble-alpes.fr/help/unlock/

Commit b44ad433 authored by Sylvain Coulange's avatar Sylvain Coulange
Browse files

Ajout mandarin

parent 3ebe17b4
asgiref==3.2.10
blis==0.4.1
catalogue==0.0.8
certifi==2019.11.28
catalogue==1.0.0
certifi==2020.4.5.1
chardet==3.0.4
cymem==2.0.3
Django==2.1
fr-core-news-md==2.2.5
fr-core-news-sm==2.2.5
idna==2.8
importlib-metadata==1.3.0
more-itertools==8.0.2
Cython==0.29.21
Django==3.1.2
django-cors-headers==3.5.0
dnspython==2.0.0
en-core-web-md @ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.1/en_core_web_md-2.3.1.tar.gz
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
fr-core-news-md @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_md-2.3.0/fr_core_news_md-2.3.0.tar.gz
fr-core-news-sm @ https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-2.2.5/fr_core_news_sm-2.2.5.tar.gz
idna==2.9
importlib-metadata==1.6.0
jieba==0.42.1
murmurhash==1.0.2
numpy==1.18.0
numpy==1.18.4
pinyin==0.4.0
pkuseg==0.0.25
plac==1.1.3
preshed==3.0.2
pymongo==3.11.0
pytz==2019.3
requests==2.22.0
spacy==2.2.3
srsly==0.2.0
thinc==7.3.1
tqdm==4.41.0
urllib3==1.25.7
wasabi==0.4.2
zipp==0.6.0
requests==2.23.0
spacy==2.3.4
sqlparse==0.4.1
srsly==1.0.2
thinc==7.4.3
tqdm==4.46.0
urllib3==1.25.9
wasabi==0.6.0
zh-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/zh_core_web_sm-2.3.1/zh_core_web_sm-2.3.1.tar.gz
zipp==3.1.0
# -*- encoding:utf8 -*-
######### phon2graph.py #########
#
# Version Python 3.7.1
#
# Prend en entrée :
# - dictionnaire phonème→couleur (phoneme-couleur.csv)
# - dictionnaire phonème→graphies (phoneme-graphies_fr.scsv)
# - dictionnaire phonétisé (dico_frwiki.csv)
#
# cf. README pour explications détaillées
import re
import sys
import os
import datetime
import json
import re, sys, os, datetime, json, tempfile, csv, locale, pinyin
from colorapp.models import Entree, DicEntry, LogStat
import tempfile
import csv
from collections import OrderedDict
import locale
locale.setlocale(locale.LC_ALL, "")
from sys import path as pylib #im naming it as pylib so that we won't get confused between os.path and sys.path
pylib += [os.path.relpath(r'../phon2graph')]
from phon2graph import decoupage
from phon2graph_french import decoupage
from phon2graph_english import decoupageEn # ENGLISH
from phon2graph_mandarin import pinyin2phon # MANDARIN CHINESE
# FICHIERS
phonColFile = "../phon2graph/data/api2class.json"
phonGraphFile = "../phon2graph/data/fidel_wikicolor.scsv" # "../phon2graph/data/phoneme-graphies_fr.scsv"
phonGraphFileEn = "../phon2graph/data/fidel_wikicolor_en_global.scsv" # ENGLISH
pinyin2apiFile = "../phon2graph/data/pinyin2api.json" # MANDARIN
api2classFile = "../phon2graph/data/api2class.json" # MANDARIN
dicFile = "../wikiphon/dico_frwiktionary-20200301_v2.json"
#dicFileEn = "../wikiphon/dico_enWiktionary-20200704_v1.json" # ENGLISH
dicFileEn = "../wikiphon/dico_enWikiCmu.json"
dicFileZh = "../wikiphon/dico_zhCCDict_20201206_v1.json"
logFile = "../logs/dico_frwiktionary-20200301_v2.log"
logFileEn = "../logs/dico_enWiktionary-20200704_v1.log" # ENGLISH
......@@ -44,47 +29,84 @@ logFileEn = "../logs/dico_enWiktionary-20200704_v1.log" # ENGLISH
logBugFile = "../logs/wikicolor-bug.log"
logBugFileEn = "../logs/wikicolorEn-bug.log" # ENGLISH
# LECTURE DU CODE PHONEME-COULEUR
########################################
######### LECTURE DES FICHIERS #########
########################################
##### LECTURE DU CODE PHONEME-COULEUR #####
with open(phonColFile,"r") as phonFile:
phon2class = json.load(phonFile)
# LECTURE DE LA LISTE PHONEME-GRAPHIES (FIDEL)
phonFile = open(phonGraphFile,mode="r")
phon2graphFr = {}
phonCpt = 0
graphCpt = 0
##### LECTURE DES DICTIONNAIRES #####
def getLenDic(dic):
    """Return the number of entries in ``dic``.

    Args:
        dic: A mapping, e.g. one of the word -> transcription dictionaries
            loaded from JSON in this module.

    Returns:
        The number of keys in ``dic``.
    """
    # len() already gives the key count; no need to walk the keys by hand.
    return len(dic)
for line in phonFile:
phonCpt+=1
line = line.strip()
l= line.split(':')
## FR
word2transFr = {} # un mot → liste de trans possibles
with open(dicFile, 'r') as f:
word2transFr = json.load(f)
print("Nombre d'entrées dans le dictionnaire de français :", getLenDic(word2transFr))
phon2graphFr[l[0]] = []
## EN
word2transEn = {} # un mot → liste de trans possibles
with open(dicFileEn, 'r') as f:
word2transEn = json.load(f)
print("Nombre d'entrées dans le dictionnaire d'anglais :", getLenDic(word2transEn))
listegraphies = l[1].split(',')
for graph in listegraphies:
phon2graphFr[l[0]].append(graph.replace("'","’"))
graphCpt+=1
## ZH
word2transZh = {} # un mot → une transcription pinyin
with open(dicFileZh, 'r') as f:
word2transZh = json.load(f)
print("Nombre d'entrées dans le dictionnaire de mandarin :", getLenDic(word2transZh))
phonFile.close()
##### LECTURE DES LISTES PHONEME-GRAPHIES (FIDEL) #####
# LECTURE DU DICTIONNAIRE
word2transFr = {} # un mot → liste de trans possibles
with open(dicFile, 'r') as f:
word2transFr = json.load(f)
lenDic = 0
for k in word2transFr.keys():
lenDic+=1
print('len frwiki :',lenDic)
def getLenDic():
    """Return the number of entries in the French dictionary ``word2transFr``."""
    # len() already gives the key count; no need to walk the keys by hand.
    return len(word2transFr)
## FR
# Each usable line of the fidel file is read as "<phoneme>:<graph1>,<graph2>,...".
phon2graphFr = {}
phonCpt = 0   # number of phoneme lines read
graphCpt = 0  # total number of graphemes read
# Explicit UTF-8: the file holds non-ASCII characters, so the result must not
# depend on the locale of the process that imports this module.
with open(phonGraphFile, mode="r", encoding="utf-8") as phonFile:
    for line in phonFile:
        line = line.strip()
        l = line.split(':')
        if len(l) < 2:
            # Blank or colon-less line: nothing to register (previously this
            # raised IndexError on l[1] and aborted the import).
            continue
        phonCpt += 1

        listegraphies = l[1].split(',')
        # Straight apostrophes are normalised to the typographic one.
        phon2graphFr[l[0]] = [graph.replace("'", "’") for graph in listegraphies]
        graphCpt += len(listegraphies)
## EN
# Each usable line of the fidel file is read as "<phoneme>:<graph1>,<graph2>,...".
phon2graphEn = {}
phonCptEn = 0   # number of phoneme lines read
graphCptEn = 0  # total number of graphemes read
# Explicit UTF-8: the file holds non-ASCII characters, so the result must not
# depend on the locale of the process that imports this module.
with open(phonGraphFileEn, mode="r", encoding="utf-8") as phonFileEn:
    for line in phonFileEn:
        line = line.strip()
        l = line.split(':')
        if len(l) < 2:
            # Blank or colon-less line: nothing to register (previously this
            # raised IndexError on l[1] and aborted the import).
            continue
        phonCptEn += 1

        listegraphies = l[1].split(',')
        # Straight apostrophes are normalised to the typographic one.
        phon2graphEn[l[0]] = [graph.replace("'", "’") for graph in listegraphies]
        graphCptEn += len(listegraphies)

# READING THE LOGS
##### LECTURE DES LOGS #####
## FR
logDicFr = {}
with open(logFile, 'r') as logf:
logDicFr = json.load(logf)
......@@ -92,7 +114,7 @@ cptEdit = 0
for i,j in logDicFr.items():
for k in j:
cptEdit+=1
print("Nombre de modifications du dictionnaire :",cptEdit)
print("Nombre de modifications du dictionnaire de français :", cptEdit)
def getLogStat():
logStat = LogStat()
logStat.cptEdit = 0
......@@ -110,46 +132,9 @@ def getLogStat():
logBug = {}
with open(logBugFile, 'r') as logf:
logBug = json.load(logf)
print("Nombre de bug d'alignement enregistrés :",len(logBug))
#############################
##### FICHIERS POUR L'ANGLAIS
# LECTURE DE LA LISTE PHONEME-GRAPHIES (FIDEL) --ENGLISH--
phonFileEn = open(phonGraphFileEn,mode="r")
phon2graphEn = {}
phonCptEn = 0
graphCptEn = 0
for line in phonFileEn:
phonCptEn+=1
line = line.strip()
l= line.split(':')
print("Nombre de bug d'alignement enregistrés en français :", len(logBug))
phon2graphEn[l[0]] = []
listegraphies = l[1].split(',')
for graph in listegraphies:
phon2graphEn[l[0]].append(graph.replace("'","’"))
graphCptEn+=1
phonFileEn.close()
# LECTURE DU DICTIONNAIRE --ENGLISH--
word2transEn = {} # un mot → liste de trans possibles
with open(dicFileEn, 'r') as f:
word2transEn = json.load(f)
lenDicEn = 0
for k in word2transEn.keys():
lenDicEn+=1
print('len enwiki :',lenDicEn)
def getLenDicEn():
lenDicEn = 0
for k in word2transEn.keys():
lenDicEn+=1
return lenDicEn
# LECTURE DES LOG --ENGLISH--
## EN
logDicEn = {}
with open(logFileEn, 'r') as logf:
logDicEn = json.load(logf)
......@@ -157,7 +142,7 @@ cptEditEn = 0
for i,j in logDicEn.items():
for k in j:
cptEditEn+=1
print("(ENGLISH) Nombre de modifications du dictionnaire :",cptEditEn)
print("Nombre de modifications du dictionnaire d'anglais :", cptEditEn)
def getLogStatEn():
logStatEn = LogStat()
logStatEn.cptEdit = 0
......@@ -175,10 +160,22 @@ def getLogStatEn():
logBugEn = {}
with open(logBugFileEn, 'r') as logf:
logBugEn = json.load(logf)
print("(ENGLISH) Nombre de bug d'alignement enregistrés :",len(logBugEn))
print("Nombre de bug d'alignement enregistrés en anglais :", len(logBugEn))
def mimi(mot, lang):
##### SUPPLEMENTS POUR MANDARIN #####
with open(pinyin2apiFile) as inFile:
pinyin2api = json.load(inFile)
with open(api2classFile) as inFile:
api2class = json.load(inFile)
########################################
######### LISTE DES FONCTIONS ##########
########################################
def traitement(mot, lang):
if lang == "fr":
word2trans = word2transFr
phon2graph = phon2graphFr
......@@ -247,9 +244,36 @@ def mimi(mot, lang):
cased += l.upper() if cptlettre == m else l
cptlettre += 1
r[0][k] = (tupl[0],cased)
return result
def traitementzh(mot):
    """Transcribe a Mandarin word character by character.

    The pinyin of ``mot`` is read from ``word2transZh`` when the word is
    listed there; otherwise it is generated with ``pinyin.get``. Each pinyin
    syllable is paired with the character at the same position and converted
    with ``pinyin2phon``.

    Args:
        mot: The word (string of characters) to transcribe.

    Returns:
        A list with one entry per paired character. A syllable that ends with
        a tone digit yields ``[character, *pinyin2phon(...)]`` (presumably
        ``[car, api, phonlist, ton]`` -- confirm against ``pinyin2phon``);
        any other syllable yields ``(character, "", [], 0)``.
    """
    result = []  # expected shape: [[car, api, phonlist, ton], [car, api, phonlist, ton]...]

    if mot in word2transZh:
        pinyinOutput = word2transZh[mot]["p"].lower()
        print("Mot trouvé dans le dictionnaire :", mot, pinyinOutput)
    else:
        print("Mot non trouvé dans le dictionnaire!")
        pinyinOutput = pinyin.get(mot, format="numerical", delimiter=" ")
        print("Translittération automatique :", pinyinOutput)  # e.g. ni3 hao3

    # split() without argument drops empty items, so an empty word or doubled
    # spaces never produce an empty syllable (indexing [-1] on one raised
    # IndexError before).
    pinparse = pinyinOutput.split()  # e.g. ['ni3', 'hao3']

    # zip() stops at the shorter sequence, so a transcription with more
    # syllables than the word has characters no longer raises IndexError.
    for hanzi, pintone in zip(mot, pinparse):
        # A tone digit preceded by at least one letter means the
        # pinyinisation worked for this character.
        if len(pintone) > 1 and pintone[-1] in ('1', '2', '3', '4', '5'):
            res = [hanzi]
            res.extend(pinyin2phon(pintone, pinyin2api, api2class))
            result.append(res)
        else:
            result.append((hanzi, "", [], 0))

    print(result)
    return result
def getEntryByWord(m,mc,t,tc,lang):
# m = mot (contenu de la barre de recherche "mot"),
# mc = motCond (condition de recherche : contient, est égal à, commence par, finit par),
......
......@@ -7,8 +7,9 @@ import spacy
import subprocess
import re
nlpFr = spacy.load('fr')
nlpFr = spacy.load('fr_core_news_md')
nlpEn = spacy.load("en_core_web_sm")
nlpZh = spacy.load("zh_core_web_sm")
logFile = "../logs/dico_frwiktionary-20200301_v2.log"
def redirApp(request):
......@@ -27,6 +28,8 @@ def colorize(request):
nlpText = nlpFr(text)
elif lang == "en":
nlpText = nlpEn(text)
elif lang == "zh":
nlpText = nlpZh(text)
outText = []
for token in nlpText:
......@@ -38,22 +41,25 @@ def colorize(request):
outText.append('§')
else:
print("Mot en entrée :",token.text)
if lang == "en":
result = txtphono.mimi(token.text,lang)
else:
result = txtphono.mimi(token.text,lang)
print(result)
phonographieList = []
for r in result:
phonographie = []
for i in r[0]:
ph = {}
ph['phon'] = i[0]
ph['graph'] = i[1]
phonographie.append(ph)
phonographieList.append((phonographie,r[1],r[2],r[3]))
outText.append(phonographieList)
if lang == "fr" or lang == "en":
result = txtphono.traitement(token.text,lang)
phonographieList = []
for r in result:
phonographie = []
for i in r[0]:
ph = {}
ph['phon'] = i[0]
ph['graph'] = i[1]
phonographie.append(ph)
phonographieList.append((phonographie,r[1],r[2],r[3]))
outText.append(phonographieList)
elif lang == "zh":
result = txtphono.traitementzh(token.text)
outText.append(result)
print("Résultat en sortie :", result)
rep = {
'outText': outText
}
......
......@@ -24,6 +24,7 @@ urlpatterns = [
path('', colorapp_views.main),
path('en/', colorapp_views.main),
path('fr/', colorapp_views.main),
path('zh/', colorapp_views.main),
path('colorize/', csrf_exempt(colorapp_views.colorize)),
path('getPhonoOf/', csrf_exempt(colorapp_views.getPhonoOf)),
#path('getAllPhonographiesOf/', csrf_exempt(colorapp_views.getAllPhonographiesOf)),
......
......@@ -19,12 +19,13 @@ var thisPageLang = "";
// set page target language
setLangFromUrl()
function setLangFromUrl() {
var pageLang = thisURL.match(/.*\/(fr|en)/);
var pageLang = thisURL.match(/.*\/(fr|en|zh)/);
if (pageLang) {
console.log("Langue indiquée par l'url:",pageLang[1]);
thisPageLang = pageLang[1];
if (pageLang[1]=="fr") selectLang("fr");
if (pageLang[1]=="en") selectLang("en");
if (pageLang[1]=="en") selectLang("en");
if (pageLang[1]=="zh") selectLang("zh");
} else {
console.log("Chargement langue par défaut (fr)");
thisPageLang = "fr"
......@@ -41,9 +42,17 @@ function selectLang(lang){
document.getElementById('monochromeLabel').style.display="";
document.getElementById('monochrome').style.display="";
document.getElementById('silentWayLabel').style.margin = "-20px 0px";
document.getElementById('bicolor').style.display="";
document.getElementById('bicolorLabel').style.display="";
document.getElementById('bicolorLabel').style.margin = "-10px 0px";
document.getElementById('subtitle').style.marginTop = "-20px";
// Paramètres output
document.getElementById('ti_btnCopierColler').style.display = "";
document.getElementById('ti_btnBold').style.display = "";
document.getElementById('btnzi').style.display = "none";
document.getElementById('btnwordspace').style.display = "none";
window.history.pushState("", "", "/en");
} else if (lang=="fr"){
......@@ -54,10 +63,37 @@ function selectLang(lang){
document.getElementById('monochromeLabel').style.display="none";
document.getElementById('monochrome').style.display="none";
document.getElementById('silentWayLabel').style.margin = "0px 0px";
document.getElementById('bicolor').style.display="";
document.getElementById('bicolorLabel').style.display="";
document.getElementById('bicolorLabel').style.margin = "0px 0px";
document.getElementById('subtitle').style.marginTop = "0px";
// Paramètres output
document.getElementById('ti_btnCopierColler').style.display = "";
document.getElementById('ti_btnBold').style.display = "";
document.getElementById('btnzi').style.display = "none";
document.getElementById('btnwordspace').style.display = "none";
window.history.pushState("", "", "/fr");
} else if (lang == "zh"){
interface("zh");
document.getElementById('choixLang').value = 'zh';
if (boolBold == false) toggleBold();
document.getElementById('monochromeLabel').style.display="none";
document.getElementById('monochrome').style.display="none";
document.getElementById('silentWayLabel').style.margin = "0px 0px";
document.getElementById('bicolor').style.display="none";
document.getElementById('bicolorLabel').style.display="none";
document.getElementById('subtitle').style.marginTop = "0px";
// Paramètres output
document.getElementById('ti_btnCopierColler').style.display = "none";
document.getElementById('ti_btnBold').style.display = "none";
document.getElementById('btnzi').style.display = "";
document.getElementById('btnwordspace').style.display = "";
window.history.pushState("", "", "/zh");
}
}
......@@ -120,117 +156,165 @@ async function getColorisation() {
console.log(data);
// ÉCRITURE DU RÉSULTATS DANS DIV RÉSULTATS
var outputDiv = document.getElementById('output');
var outText = data['outText'];
outputDiv.innerHTML = "";
dicoTok = {};
dicoId = {};
var noSpace = false;
for (i=0; i<outText.length; i++) {
// REMPLISSAGE DES ZONES MOTS DANS DIV RÉSULTATS
if (outText[i][0][0] == []) {
// Sécurité: si phonigraphie est une liste vide, on tente quand même d'afficher du texte en gris
console.log("Bug outText[",i,"][0][0] is empty!");
// ici trouver code pour renvoyer txt brut
} else if (outText[i][0][0][0] == undefined){
console.log("undefined: index",i,outText[i][0][0])
} else {
if (outText[i][0][0][0].graph.match(/\n+/)){
for (h=0; h<outText[i][0][0][0].graph.length; h++) {
outputDiv.innerHTML = outputDiv.innerHTML + '<br>';
}
} else if (outText[i][0][0][0].graph.match(/^(,|\.|…|\)|\]|\}|%|>|»|”|-)$/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="phon_neutre">'+ outText[i][0][0][0].graph +'</span>';
if (outText[i][0][0][0].graph.match(/^-$/)) noSpace = true; else noSpace = false;
} else if (outText[i][0][0][0].graph.match(/\(|\[|\{|<|«|“/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="phon_neutre">'+ outText[i][0][0][0].graph +'</span>';
noSpace = true;
} else if (noSpace) {
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'"></span>';
}
if (outText[i][0][0][0].graph.match(/^.+’$/)) noSpace = true; else noSpace = false;
if (lang == "fr" || lang == "en"){
document.getElementById('outputzh').innerHTML = "";
var outputDiv = document.getElementById('output');
var outText = data['outText'];
outputDiv.innerHTML = "";
dicoTok = {};
dicoId = {};
var noSpace = false;
for (i=0; i<outText.length; i++) {
// REMPLISSAGE DES ZONES MOTS DANS DIV RÉSULTATS
if (outText[i][0][0] == []) {
// Sécurité: si phonigraphie est une liste vide, on tente quand même d'afficher du texte en gris
console.log("Bug outText[",i,"][0][0] is empty!");
// ici trouver code pour renvoyer txt brut
} else if (outText[i][0][0][0] == undefined){
console.log("undefined: index",i,outText[i][0][0])
} else {
if (outText[i][0][0][0].graph.match(/^’.+$/)) { // I'll you're
if (outText[i][0][0][0].graph.match(/\n+/)){
for (h=0; h<outText[i][0][0][0].graph.length; h++) {
outputDiv.innerHTML = outputDiv.innerHTML + '<br>';
}
} else if (outText[i][0][0][0].graph.match(/^(,|\.|…|\)|\]|\}|%|>|»|”|-)$/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="phon_neutre">'+ outText[i][0][0][0].graph +'</span>';
if (outText[i][0][0][0].graph.match(/^-$/)) noSpace = true; else noSpace = false;
} else if (outText[i][0][0][0].graph.match(/\(|\[|\{|<|«|“/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="phon_neutre">'+ outText[i][0][0][0].graph +'</span>';
noSpace = true;
} else if (noSpace) {
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'"></span>';
}
if (outText[i][0][0][0].graph.match(/^.+’$/)) noSpace = true; else noSpace = false;
} else {
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
if (outText[i][0][0][0].graph.match(/^’.+$/)) { // I'll you're
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'"></span>';
}
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'"></span>';
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'"></span>';
}
}
if (outText[i][0][0][0].graph.match(/^.+’$/)) noSpace = true; else noSpace = false;
}
if (outText[i][0][0][0].graph.match(/^.+’$/)) noSpace = true; else noSpace = false;
}
// FORMATAGE DES SPANS
if (outText[i][0][0][0].graph != '\n'){
dicoTok['tok'+i] = [];
waitinglist = []; // we will put all non aligned ones here, waiting to pushing them all at the end (so that they can't appear first)
for (j=0; j<outText[i].length; j++) {