The function splits the phrase into words (by whitespace) and detects the language of each word (by counting occurrences of each language's characters).
E.g. My home is 马来西亚
= 'en', 'en', 'en', 'zh', thus this phrase in English.
Currently it only supports English (all Latin-character languages, including German, Spanish, French, Indonesian, etc., will be categorized as English), Chinese, Hindi, Arabic, Bengali, and Hebrew.
/**
 * Detects the dominant language of a phrase.
 *
 * Splits the phrase on whitespace, detects each word's language by the
 * fraction of its characters matching per-language Unicode ranges, then
 * returns the language that occurs most often across all words.
 *
 * @param {string} text - The phrase to analyze.
 * @returns {string|null} Language code ('en', 'zh', 'hi', 'ar', 'bn', 'he'),
 *   or null when no word matches any known script.
 */
function detectLanguage(text) {
  // Split into words.
  const langs = text.trim().split(/\s+/).map(word => detect(word))

  // Pick the language with the most occurrences.
  // BUG FIX: the original compared the language *name* (acc.max, a string)
  // against a *count* (acc.k[el], a number), which is never true, so the
  // first detected language always won. Compare the two counts instead.
  return langs.reduce((acc, el) => {
    acc.k[el] = acc.k[el] ? acc.k[el] + 1 : 1
    acc.max = acc.max ? (acc.k[acc.max] < acc.k[el] ? el : acc.max) : el
    return acc
  }, { k: {} }).max

  /**
   * Detects the language of a single word.
   * @param {string} text - One word.
   * @returns {string|null} Language code, or null if no script matched.
   */
  function detect(text) {
    const scores = {}

    // Character ranges per language (Unicode code-point blocks).
    const regexes = {
      'en': /[\u0000-\u007F]/gi,
      'zh': /[\u3000\u3400-\u4DBF\u4E00-\u9FFF]/gi,
      'hi': /[\u0900-\u097F]/gi,
      'ar': /[\u0621-\u064A\u0660-\u0669]/gi,
      'bn': /[\u0995-\u09B9\u09CE\u09DC-\u09DF\u0985-\u0994\u09BE-\u09CC\u09D7\u09BC]/gi,
      'he': /[\u0590-\u05FF]/gi,
    }

    for (const [lang, regex] of Object.entries(regexes)) {
      // Count occurrences of this language's characters in the word.
      const matches = text.match(regex) || []
      const score = matches.length / text.length
      if (score) {
        // High percentage — confident enough to return immediately.
        if (score > 0.85) {
          return lang
        }
        scores[lang] = score
      }
    }

    // No script matched at all.
    if (Object.keys(scores).length === 0) return null

    // Pick the language with the highest percentage.
    return Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b);
  }
}
Test
// Quick sanity check: run detectLanguage over sample phrases in several
// languages and print a phrase -> detected-language map.
const values = [
  'I am a happy cow',            // English
  'Je suis une vache heureuse',  // French
  'Ich bin eine glückliche Kuh', // German
  'Soy una vaca feliz',          // Spanish
  'Saya sapi yang bahagia',      // Indonesian
  '我是一头快乐的牛',               // Chinese Simplified
  '我是一頭快樂的牛',               // Chinese Traditional
  '私は幸せな牛です',               // Japanese
  'मैं एक खुश गाय हूँ',              // Hindi
  'انا بقرة سعيدة',              // Arabic
  'Я счастливая корова'          // Russian
]

const data = {}
for (const value of values) {
  data[value] = detectLanguage(value)
}
console.log(data)
Result:
- All Latin-character languages are categorized as English
- Japanese is detected as Chinese (it contains 3 Chinese characters). Adding Japanese character detection would mitigate this issue.
{ "I am a happy cow": "en", "Je suis une vache heureuse": "en", "Ich bin eine glückliche Kuh": "en", "Soy una vaca feliz": "en", "Saya sapi yang bahagia": "en", "我是一头快乐的牛": "zh", "我是一頭快樂的牛": "zh", "私は幸せな牛です": "zh", "मैं एक खुश गाय हूँ": "hi", "انا بقرة سعيدة": "ar", "Я счастливая корова": null}
XRegExp
You can use XRegExp to make use of named unicode blocks or unicode scripts.
NOTE: I am not sure of the current status of Unicode block/script
support in browsers and Node.js
npm install xregexp
const XRegExp = require('xregexp');

/**
 * Detects the dominant language of a phrase, using XRegExp Unicode
 * *script* escapes instead of hand-written code-point ranges.
 *
 * Splits the phrase on whitespace, detects each word's language by the
 * fraction of its characters matching per-language scripts, then returns
 * the language that occurs most often across all words.
 *
 * @param {string} text - The phrase to analyze.
 * @returns {string|null} Language code ('en', 'zh', 'hi', 'ar', 'bn',
 *   'he', 'ru', 'jp', 'pa'), or null when no word matches a known script.
 */
function detectLanguage(text) {
  // Split into words.
  const langs = text.trim().split(/\s+/).map(word => detect(word))

  // Pick the language with the most occurrences.
  // BUG FIX: the original compared the language *name* (acc.max, a string)
  // against a *count* (acc.k[el], a number), which is never true, so the
  // first detected language always won. Compare the two counts instead.
  return langs.reduce((acc, el) => {
    acc.k[el] = acc.k[el] ? acc.k[el] + 1 : 1
    acc.max = acc.max ? (acc.k[acc.max] < acc.k[el] ? el : acc.max) : el
    return acc
  }, { k: {} }).max

  /**
   * Detects the language of a single word via Unicode script membership.
   * @param {string} text - One word.
   * @returns {string|null} Language code, or null if no script matched.
   */
  function detect(text) {
    const scores = {}

    // https://en.wikipedia.org/wiki/Unicode_block
    // http://www.regular-expressions.info/unicode.html#script
    const regexes = {
      // en: /[a-zA-Z]+/gi,
      en: XRegExp('\\p{Latin}', 'gi'),
      zh: XRegExp('\\p{Han}', 'gi'),
      hi: XRegExp('\\p{Devanagari}', 'gi'),
      ar: XRegExp('\\p{Arabic}', 'gi'),
      bn: XRegExp('\\p{Bengali}', 'gi'),
      he: XRegExp('\\p{Hebrew}', 'gi'),
      ru: XRegExp('\\p{Cyrillic}', 'gi'),
      jp: XRegExp('[\\p{Hiragana}\\p{Katakana}]', 'gi'),
      pa: XRegExp('\\p{Gurmukhi}', 'gi')
    }

    for (const [lang, regex] of Object.entries(regexes)) {
      // Count occurrences of this script's characters in the word.
      const matches = XRegExp.match(text, regex) || []
      const score = matches.length / text.length
      if (score) {
        // High percentage — confident enough to return immediately.
        if (score > 0.85) {
          return lang
        }
        scores[lang] = score
      }
    }

    // No script matched at all.
    if (Object.keys(scores).length === 0) return null

    // Pick the language with the highest percentage.
    return Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b);
  }
}
Result:
- The result is better, as Japanese and Russian character detection has been added
- It still cannot detect language in latin characters, which might need a language detection library.
{ "I am a happy cow": "en", "Je suis une vache heureuse": "en", "Ich bin eine glückliche Kuh": "en", "Soy una vaca feliz": "en", "Saya sapi yang bahagia": "en", "我是一头快乐的牛": "zh", "我是一頭快樂的牛": "zh", "私は幸せな牛です": "jp", "मैं एक खुश गाय हूँ": "hi", "انا بقرة سعيدة": "ar", "Я счастливая корова": "ru"}
I tested with longer phrases which have Latin characters in them.
{ "What do we know so far about the Covid-19 variants?": "en", "Que savons-nous jusqu'à présent des variantes du Covid-19?": "en", "Was wissen wir bisher über die Covid-19-Varianten?": "en", "¿Qué sabemos hasta ahora sobre las variantes de Covid-19?": "en", "Apa yang kita ketahui sejauh ini tentang varian Covid-19?": "en", "到目前为止,我们对Covid-19变体了解多少?": "zh", "到目前為止,我們對Covid-19變體了解多少?": "zh", "Covid-19バリアントについてこれまでに何を知っていますか?": "jp", "हम कोविड -19 वेरिएंट के बारे में अब तक क्या जानते हैं?": "hi", "ما الذي نعرفه حتى الآن عن بدائل Covid-19؟": "ar", "Что мы знаем о вариантах Covid-19?": "ru"}