Commit 7e0b50d2 authored by Sylvain Coulange's avatar Sylvain Coulange
Browse files

Grosse mise à jour anglais

parent 0ff1c92b
......@@ -8,7 +8,7 @@ import subprocess
import re
nlpFr = spacy.load('fr')
#nlpEn = spacy.load('en')
nlpEn = spacy.load("en_core_web_sm")
logFile = "../logs/dico_frwiktionary-20200301_v2.log"
def redirApp(request):
......@@ -23,11 +23,12 @@ def colorize(request):
colis = json.loads(request.body)
text = colis['inText']
lang = colis['lang']
nlpText = nlpFr(text)
if lang == "fr":
nlpText = nlpFr(text)
elif lang == "en":
nlpText = nlpEn(text)
outText = []
#punctuation = [',','.'] # ')',']','}','%','>',"'",'(','[','{'
for token in nlpText:
sdl = re.findall(r'\r\n',token.text)
print("sdl =",sdl)
......@@ -37,18 +38,21 @@ def colorize(request):
outText.append('§')
else:
print("Mot en entrée :",token.text)
result = txtphono.mimi(token.text)
if lang == "en":
result = txtphono.mimi(token.text,lang)
else:
result = txtphono.mimi(token.text,lang)
print(result)
phonographieList = []
for r in result:
phonographie = []
for i in r:
for i in r[0]:
ph = {}
ph['phon'] = i[0]
ph['graph'] = i[1]
phonographie.append(ph)
phonographieList.append(phonographie)
phonographieList.append((phonographie,r[1],r[2],r[3]))
outText.append(phonographieList)
rep = {
'outText': outText
......
......@@ -143,10 +143,12 @@ Install spaCy :
pip install -U spaCy
python -m spacy download fr
python -m spacy download fr_core_news_md
python -m spacy download en_core_web_sm
Use spaCy :
import spacy
from spacy import displacy
nlp = spacy.load('fr')
nlpEn = spacy.load("en_core_web_sm")
doc = nlp('Demain je travaille à la maison')
for token in doc:
print(token.text)
......
var loc2stand = {
"UK":"UK",
"RP":"UK",
"Scotland":"UK",
"Ireland":"UK",
"Northern England":"UK",
"British":"UK",
"Received Pronunciation":"UK",
"Wales":"UK",
"Conservative RP":"UK",
"North England":"UK",
"Geordie":"UK",
"Estuary English":"UK",
"Cockney":"UK",
"Welsh English":"UK",
"England":"UK",
"Southern England":"UK",
"Welsh":"UK",
"Irish":"UK",
"English":"UK",
"[[RP]]":"UK",
"[[w:Received Pronunciation":"UK",
"RP]]":"UK",
"some Northern English accents":"UK",
"Midlands":"UK",
"Scottish":"UK",
"Northern Ireland":"UK",
"UK, dated":"UK",
"UK, US":"UK",
"Southern England, Australia":"UK",
"Northern England, Scotland":"UK",
"Scots":"UK",
"Northern English":"UK",
"South Wales":"UK",
"RP dated":"UK",
"Northumberland":"UK",
"English Midlands":"UK",
"London":"UK",
"Bristolian":"UK",
"Southeast England":"UK",
"Southwest England":"UK",
"Cavan":"UK",
"Donegal":"UK",
"Monaghan":"UK",
"Leinster":"UK",
"Connacht":"UK",
"Munster":"UK",
"Yorkshire":"UK",
"US":"US",
"GA":"US",
"GenAm":"US",
"NYC":"US",
"General American":"US",
"Southern US":"US",
"New England":"US",
"New York":"US",
"Boston":"US",
"regional US":"US",
"GenAM":"US",
"Southern American English":"US",
"Northern US":"US",
"NY":"US",
"Midwestern US":"US",
"North America":"US",
"Michigan":"US",
"New York City":"US",
"Philadelphia":"US",
"[[w:General American":"US",
"GenAm]]":"US",
"Appalachia":"US",
"North American also":"US",
"Oregon":"US",
"northern US":"US",
"southern US":"US",
"North American":"US",
"UK, US":"US",
"Eastern New England":"US",
"parts of the US and Canada":"US",
"Southern US folk speech":"US",
"parts of the US and Canada":"US",
"some US":"US",
"Western US":"US",
"Appalachian":"US",
"CA]]; US, in accents with the [[cot-caught merger]]":"US",
"California":"US",
"Norfolk":"US",
"US, UK":"US",
"Western Pennsylvania":"US",
"New Hampshire":"US",
"Canada":"CA",
"CA":"CA",
"Canadian":"CA",
"parts of the US and Canada":"CA",
"parts of the US and Canada":"CA",
"[[w:Canadian English":"CA",
"CA]]; US, in accents with the [[cot-caught merger]]":"CA",
"AU":"AU",
"Australia":"AU",
"AusE":"AU",
"Aus":"AU",
"AuE":"AU",
"General Australian":"AU",
"Tasmanian":"AU",
"Australian":"AU",
"[[w:Australian English phonology":"AU",
"Southern England, Australia":"AU",
"AusE]]":"AU",
"AUS":"AU",
"GenAus":"AU",
"NZ":"NZ",
"New Zealand":"NZ",
"GNZ":"NZ",
"General New Zealand":"NZ",
"Singapore":"XX",
"South Africa":"XX",
"Singapore English":"XX",
"Philippine":"XX",
"India":"XX",
"Malaysia":"XX",
"Hong Kong":"XX",
"Singapore]]":"XX",
"South African":"XX",
"Nigeria":"XX",
"Malaysian English":"XX",
"[[w:Singapore English":"XX",
"Singaporean English":"XX",
"[[Singapore English]]":"XX",
"InE":"XX",
"MLE":"XX",
"Northern Cities Vowel Shift":"XX",
"West Country":"XX",
"IE":"XX",
"anglicised":"XX",
"northern cities vowel shift":"XX",
"anglicized":"XX",
"Anglicized":"XX",
"SAE":"XX",
"Spanish":"XX",
"imitating the French pronunciation":"XX",
"St. Louis":"XX",
"Commonwealth":"XX",
"Mid-Atlantic":"XX",
"[[w:Malaysian English":"XX",
"Malaysia]]":"XX",
"[[Cockney]]":"XX",
"Northern and Western":"XX",
"Southern":"XX",
"S Africa":"XX",
"[[w:Singapore":"XX"
}
\ No newline at end of file
......@@ -4,6 +4,13 @@ $(document).ready(function(){
txtarea.focus();
});
/**
 * Called with the language code chosen by the user.
 * Only English triggers anything: a notice that the English version is experimental.
 */
function selectLang(lang){
    // Nothing to announce for any language other than English.
    if (lang != "en") {
        return;
    }
    window.alert('English version still experimental :)');
}
var dicoTok = {};
var dicoId = {};
......@@ -44,52 +51,81 @@ async function getColorisation() {
for (i=0; i<outText.length; i++) {
// REMPLISSAGE DES ZONES MOTS DANS DIV RÉSULTATS
if (outText[i][0][0].graph.match(/\n+/)){
for (h=0; h<outText[i][0][0].graph.length; h++) {
if (outText[i][0][0][0].graph.match(/\n+/)){
for (h=0; h<outText[i][0][0][0].graph.length; h++) {
outputDiv.innerHTML = outputDiv.innerHTML + '<br>';
}
} else if (outText[i][0][0].graph.match(/^(,|\.|…|\)|\]|\}|%|>|»|”|-)$/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="phon_neutre">'+ outText[i][0][0].graph +'</span>';
if (outText[i][0][0].graph.match(/^-$/)) noSpace = true; else noSpace = false;
} else if (outText[i][0][0].graph.match(/\(|\[|\{|<|«|“/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="phon_neutre">'+ outText[i][0][0].graph +'</span>';
} else if (outText[i][0][0][0].graph.match(/^(,|\.|…|\)|\]|\}|%|>|»|”|-)$/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="phon_neutre">'+ outText[i][0][0][0].graph +'</span>';
if (outText[i][0][0][0].graph.match(/^-$/)) noSpace = true; else noSpace = false;
} else if (outText[i][0][0][0].graph.match(/\(|\[|\{|<|«|“/)) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="phon_neutre">'+ outText[i][0][0][0].graph +'</span>';
noSpace = true;
// } else if (outText[i][0][0].graph.match(/.*’/)) {
// outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="phon_neutre">'+ outText[i][0][0].graph +'</span>';
// noSpace = true;
} else if (noSpace) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'" onclick="changeAlign(this.id)"></span>';
if (outText[i][0][0].graph.match(/.*’/)) noSpace = true; else noSpace = false;
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'"></span>';
}
if (outText[i][0][0][0].graph.match(/^.+’$/)) noSpace = true; else noSpace = false;
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'" onclick="changeAlign(this.id)"></span>';
if (outText[i][0][0].graph.match(/.*’/)) noSpace = true; else noSpace = false;
if (outText[i][0][0][0].graph.match(/^’.+$/)) { // I'll you're
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span class="tokens" id="tok'+ i +'"></span>';
}
} else {
if (outText[i].length>1) {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'" onclick="showAlignPop(this.id)"></span>';
} else {
outputDiv.innerHTML = outputDiv.innerHTML + '<span> </span><span class="tokens" id="tok'+ i +'"></span>';
}
}
if (outText[i][0][0][0].graph.match(/^.+’$/)) noSpace = true; else noSpace = false;
}
// FORMATAGE DES SPANS
if (outText[i][0][0].graph != '\n'){
if (outText[i][0][0][0].graph != '\n'){
dicoTok['tok'+i] = [];
waitinglist = []; // we will put all non aligned ones here, waiting to pushing them all at the end (so that they can't appear first)
for (j=0; j<outText[i].length; j++) {
var newWord = '';
for (k=0; k<outText[i][j].length; k++) {
//console.log(outText[i][j][k]);
newWord = newWord + '<span class="phon '+ outText[i][j][k].phon +'">'+ outText[i][j][k].graph +'</span>';
for (k=0; k<outText[i][j][0].length; k++) {
newWord = newWord + '<span class="phon '+ outText[i][j][0][k].phon +'">'+ outText[i][j][0][k].graph +'</span>';
}
if (outText[i][j][3].length > 0) {
waitinglist.push([newWord,outText[i][j][1],outText[i][j][2],outText[i][j][3]]);
} else {
dicoTok['tok'+i].push([newWord,outText[i][j][1],outText[i][j][2],outText[i][j][3]]);
}
dicoTok['tok'+i].push(newWord);
}
for (l=0; l<waitinglist.length; l++){
dicoTok['tok'+i].push(waitinglist[l]);
}
}
// INITIALISATION DU dicoId
if (outText[i][0][0].graph != '\n'){
if (outText[i][0][0][0].graph != '\n'){
dicoId['tok'+i] = 0;
}
}
var tokens = document.getElementsByClassName("tokens");
for (i = 0; i < tokens.length; i++) {
tokens[i].innerHTML = dicoTok[tokens[i].id][0];
if (dicoTok[tokens[i].id].length > 1) {
// dicoTok[tokens[i].id] = dicoTok[tokens[i].id].sort(function(a, b){
// // ASC -> a.length - b.length
// // DESC -> b.length - a.length
// return b.length - a.length;
// });
tokens[i].innerHTML = dicoTok[tokens[i].id][0][0];
$('#'+tokens[i].id).addClass('transMult');
$('#'+tokens[i].id).prop('title', 'changer de prononciation');
} else {
tokens[i].innerHTML = dicoTok[tokens[i].id][0][0];
};
};
unknownMark();
......@@ -109,7 +145,8 @@ function unknownMark() {
var marks = document.getElementsByClassName('unknownMark');
var i;
for (i=0; i < marks.length; i++) {
marks[i].parentNode.removeChild(marks[i]);
//marks[i].parentNode.removeChild(marks[i]);
marks[i].style.display = "none";
}
var unknowns = document.getElementsByClassName('phon_inconnu');
......@@ -140,7 +177,11 @@ function dicoAddPop(mot) {
document.getElementById('popAddwValid').addEventListener('click', function(){
var trans = document.getElementById('popAddwTrans').value;
if (trans.length > 0) {
validEntry(mot, trans);
if (document.getElementById('choixLang').value == "fr") {
validEntry(mot, trans);
} else {
window.alert("Fonction désactivée pour l'anglais pour l'instant");
}
}
});
}
......@@ -178,14 +219,18 @@ function bugMark() {
var marks = document.getElementsByClassName('bugMark');
var i;
for (i=0; i < marks.length; i++) {
marks[i].parentNode.removeChild(marks[i]);
//marks[i].parentNode.removeChild(marks[i]);
marks[i].style.display = "none";
}
var bugs = document.getElementsByClassName('phon_echec');
var i;
for (i = 0; i < bugs.length; i++) {
var newMark = document.createElement('span');
newMark.classList = 'glyphicon glyphicon-flash bugMark';
newMark.title = "Problème de colorisation ! L'erreur a été signalée.";
var thisMotId = bugs[i].parentNode.id;
console.log(thisMotId);
var infoMot = dicoTok[thisMotId][dicoId[thisMotId]];
newMark.title = '/' + infoMot[2] + '/: ' + infoMot[3];
bugs[i].parentNode.insertBefore(newMark,bugs[i].nextSibling);
};
}
......@@ -197,24 +242,61 @@ if (document.getElementById('bicolor').checked == true) {
toBicolor();
};
function changeAlign(tok) {
//console.log(tok);
if (dicoId[tok] == dicoTok[tok].length-1) {
dicoId[tok] = 0;
} else {
dicoId[tok] += 1;
};
//console.log(dicoId[tok]);
document.getElementById(tok).innerHTML = dicoTok[tok][dicoId[tok]];
/**
 * Open the pronunciation-choice popup under a token.
 *
 * Shows the '#alignPopDivBack' backdrop, appends a '#alignPopDiv' box inside
 * the token element and fills it with one table row per entry of dicoTok[tok].
 * Each entry is read as [html, locale, ...]: `html` is the ready-made coloured
 * markup of the word, `locale` is the label looked up in loc2stand to pick a flag.
 * Pressing a row calls selectAlign(tok, index); clicking the backdrop closes the popup.
 *
 * @param {string} tok  id of the token element (also the key into dicoTok)
 */
function showAlignPop(tok) {
    // Flag image for each standard code; codes without an entry (e.g. "XX") get no flag.
    var flagByStand = {
        "UK": "static/im/flag-uk.png",
        "US": "static/im/flag-us.png",
        "AU": "static/im/flag-au.png",
        "CA": "static/im/flag-ca.png",
        "NZ": "static/im/flag-nz.png"
    };

    // The locale label is free text placed inside a quoted attribute: escape it
    // so that a quote or angle bracket in the label cannot break the markup.
    function escapeHtml(text) {
        return String(text)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;")
            .replace(/'/g, "&#39;");
    }

    var alignPopDivBack = document.getElementById('alignPopDivBack');
    alignPopDivBack.style.display = "block";

    // Position the popup just below the token, slightly shifted to the left.
    var el = document.getElementById(tok);
    var alignPopDiv = document.createElement('div');
    alignPopDiv.className = "alignPopDiv";
    alignPopDiv.id = "alignPopDiv";
    alignPopDiv.style.top = el.offsetTop + el.offsetHeight + "px";
    alignPopDiv.style.left = el.offsetLeft - 20 + "px";
    alignPopDiv.style.minWidth = el.offsetWidth + "px";
    el.appendChild(alignPopDiv);

    var rows = [];
    // `var i` keeps the counter local instead of leaking an implicit global
    // shared with the other loops of the page.
    for (var i = 0; i < dicoTok[tok].length; i++) {
        var entry = dicoTok[tok][i];
        var flagSrc = flagByStand[loc2stand[entry[1]]] || "";
        // No <img> at all when there is no flag: an empty src renders as a broken image.
        var flagCell = flagSrc ? "<img style='width:35px' src='" + flagSrc + "'/>" : "";
        rows.push(
            "<tr onmousedown='selectAlign(\"" + tok + "\"," + i + ")' onmouseup='supprAlignPopDiv();' title='" + escapeHtml(entry[1]) + "'>" +
            "<td>" + flagCell + "</td><td>" + entry[0] + "</td></tr>"
        );
    }
    alignPopDiv.innerHTML = '<table class="tableAlign">' + rows.join("") + "</table>";

    // Close on a click on the backdrop itself. The handler is attached to the
    // backdrop rather than to window.onclick, because window.onclick is also
    // assigned elsewhere in this file (modal closing) and each assignment
    // would silently replace the other handler.
    alignPopDivBack.onclick = function(event) {
        if (event.target == alignPopDivBack) {
            var openPop = document.getElementById('alignPopDiv');
            // The popup may already have been removed (e.g. by selectAlign rewriting the token).
            if (openPop && openPop.parentElement) {
                openPop.parentElement.removeChild(openPop);
            }
            alignPopDivBack.style.display = "none";
        }
    };
}
/**
 * Apply the pronunciation chosen in the popup to a token.
 *
 * Records the choice in dicoId, hides the popup backdrop, replaces the token's
 * markup with the chosen variant, then refreshes the bicolor rendering (when
 * enabled) and the unknown/bug markers.
 *
 * @param {string} tok  id of the token element (also the key into dicoTok / dicoId)
 * @param {number} i    index of the chosen variant in dicoTok[tok]
 */
function selectAlign(tok,i){
    dicoId[tok] = i;
    document.getElementById('alignPopDivBack').style.display = "none";
    document.getElementById(tok).innerHTML = dicoTok[tok][i][0];
    // Re-apply bicolor once: needed because the new variant may have a
    // different number of phonemes than the previous one.
    if (document.getElementById('bicolor').checked == true) {
        toBicolor();
    }
    unknownMark();
    bugMark();
}
/**
 * Remove the pronunciation popup ('#alignPopDiv') from the page, if it is still there.
 */
function supprAlignPopDiv() {
    var openPop = document.getElementById('alignPopDiv');
    // The popup lives inside the token element, so it is already gone once the
    // token's innerHTML has been rewritten; without this guard the call throws
    // a TypeError on the null lookup.
    if (openPop && openPop.parentElement) {
        openPop.parentElement.removeChild(openPop);
    }
}
// POUR PASSER EN BICOLOR (se fait à partir du résultat aux couleurs SW)
// on ajoute la classe .bicolor1 ou .bicolor2 alternativement, si la graphie n'est pas .phon_neutre
......@@ -358,7 +440,7 @@ span.onclick = function() {
window.onclick = function(event) {
if (event.target == modal) {
modal.style.display = "none";
}
}
}
// COPIE AUTO DU LIEN DANS PRESSE-PAPIER
......
......@@ -26,28 +26,31 @@ locale.setlocale(locale.LC_ALL, "")
from sys import path as pylib #im naming it as pylib so that we won't get confused between os.path and sys.path
pylib += [os.path.relpath(r'../phon2graph')]
from phon2graph import decoupage
from phon2graph_english import decoupageEn # ENGLISH
# FICHIERS
phonColFile = "../phon2graph/data/class2api.json"
phonColFile = "../phon2graph/data/api2class.json"
phonGraphFile = "../phon2graph/data/fidel_wikicolor.scsv" # "../phon2graph/data/phoneme-graphies_fr.scsv"
phonGraphFileEn = "../phon2graph/data/fidel_wikicolor_en.scsv" # ENGLISH
dicFile = "../wikiphon/dico_frwiktionary-20200301_v2.json"
dicFileEn = "../wikiphon/dico_enWiktionary-20200704_v1.json" # ENGLISH
logFile = "../logs/dico_frwiktionary-20200301_v2.log"
logFileEn = "../logs/dico_enWiktionary-20200704_v1.log" # ENGLISH
logBugFile = "../logs/wikicolor-bug.log"
logBugFileEn = "../logs/wikicolorEn-bug.log" # ENGLISH
# LECTURE DU CODE PHONEME-COULEUR
with open(phonColFile,"r") as phonFile:
class2phon = json.load(phonFile)
phon2class = json.load(phonFile)
# Inversion du dictionnaire class2phon
phon2class = {}
for cl,ph in class2phon.items():
phon2class[ph] = cl
# Ajout de l'équivalent ɲ à nj
phon2class['ɲ'] = "phon_nj"
# LECTURE DE LA LISTE PHONEME-GRAPHIES (FIDEL)
phonFile = open(phonGraphFile,mode="r")
phon2graph = {}
phon2graphFr = {}
phonCpt = 0
graphCpt = 0
......@@ -56,29 +59,27 @@ for line in phonFile:
line = line.strip()
l= line.split(':')
phon2graph[l[0]] = []
phon2graphFr[l[0]] = []
listegraphies = l[1].split(',')
for graph in listegraphies:
phon2graph[l[0]].append(graph.replace("'","’"))
phon2graphFr[l[0]].append(graph.replace("'","’"))
graphCpt+=1
#print(l[0],phon2graph[l[0]])
phonFile.close()
# LECTURE DU DICTIONNAIRE
word2trans = {} # un mot → liste de trans possibles
word2transFr = {} # un mot → liste de trans possibles
with open(dicFile, 'r') as f:
word2trans = json.load(f)
word2transFr = json.load(f)
lenDic = 0
for k in word2trans.keys():
for k in word2transFr.keys():
lenDic+=1
print('len frwiki :',lenDic)
def getLenDic():
lenDic = 0
for k in word2trans.keys():
for k in word2transFr.keys():
lenDic+=1
return lenDic
......@@ -110,7 +111,80 @@ with open(logBugFile, 'r') as logf:
logBug = json.load(logf)
print("Nombre de bug d'alignement enregistrés :",len(logBug))
def mimi(mot):
#############################
##### FICHIERS POUR L'ANGLAIS
# READ THE PHONEME -> SPELLINGS LIST (FIDEL) --ENGLISH--
# Each line is split on ':' into a key (field 0) and a comma-separated list of
# spellings (field 1); straight apostrophes in spellings become typographic ones.
# NOTE(review): the file is opened with the platform default encoding, like the
# French loader; the data holds non-ASCII symbols, so an explicit
# encoding="utf-8" is probably wanted — confirm before changing.
phon2graphEn = {}
phonCptEn = 0   # number of lines read
graphCptEn = 0  # total number of spellings read
# `with` guarantees the handle is closed even if a malformed line raises.
with open(phonGraphFileEn, mode="r") as phonFileEn:
    for line in phonFileEn:
        phonCptEn += 1
        line = line.strip()
        l = line.split(':')
        phon2graphEn[l[0]] = []
        listegraphies = l[1].split(',')
        for graph in listegraphies:
            phon2graphEn[l[0]].append(graph.replace("'", "’"))
            graphCptEn += 1
# READ THE DICTIONARY --ENGLISH--
# word2transEn: one word -> list of possible transcriptions
with open(dicFileEn, 'r') as f:
    word2transEn = json.load(f)

# Number of entries, printed once at import time.
lenDicEn = len(word2transEn)
print('len enwiki :', lenDicEn)
def getLenDicEn():
    """Return the current number of entries in the English dictionary ``word2transEn``."""
    return len(word2transEn)
# LECTURE DES LOG --ENGLISH--