JavaScript: Regexp for Language Detection (Node.js)

The function splits a phrase into words (on whitespace) and detects the language of each word (by counting occurrences of each language's characters).

E.g. My home is 马来西亚 = 'en', 'en', 'en', 'zh', thus this phrase is in English.

Currently it only supports English (all Latin characters, including German, Spanish, French, Indonesian, etc., will be categorized as English), Chinese, Hindi, Arabic, Bengali, and Hebrew.

/**
 * Detect the dominant language of a phrase.
 *
 * The phrase is split on whitespace, each word is classified by the share of
 * its characters that fall into a language's character ranges, and the
 * language detected for the largest number of words wins. On a tie, the
 * language that reached the winning count first is returned.
 *
 * @param {string} text - The phrase to analyse.
 * @returns {?string} One of 'en', 'zh', 'hi', 'ar', 'bn', 'he', or null when
 *   no word could be classified.
 */
function detectLanguage(text) {
  // Character ranges per language. 'en' covers the whole ASCII range
  // (U+0000-U+007F), so digits and ASCII punctuation count towards 'en'.
  // Built once per call instead of once per word.
  const regexes = {
    'en': /[\u0000-\u007F]/gi,
    'zh': /[\u3000\u3400-\u4DBF\u4E00-\u9FFF]/gi,
    'hi': /[\u0900-\u097F]/gi,
    'ar': /[\u0621-\u064A\u0660-\u0669]/gi,
    'bn': /[\u0995-\u09B9\u09CE\u09DC-\u09DF\u0985-\u0994\u09BE-\u09CC\u09D7\u09BC]/gi,
    'he': /[\u0590-\u05FF]/gi,
  };

  // split into words and classify each one
  const langs = text.trim().split(/\s+/).map(word => detect(word));

  // pick the lang with the most occurrences
  const counts = {};
  let best = null;
  for (const lang of langs) {
    // words that matched no language do not take part in the vote
    if (lang === null) {
      continue;
    }
    counts[lang] = (counts[lang] || 0) + 1;
    // compare counts (not language codes); strict '>' keeps the earlier
    // language on a tie
    if (best === null || counts[lang] > counts[best]) {
      best = lang;
    }
  }
  return best;

  /**
   * Classify a single word.
   *
   * @param {string} word - A whitespace-free token.
   * @returns {?string} Language code, or null when no range matched.
   */
  function detect(word) {
    const scores = {};

    for (const [lang, regex] of Object.entries(regexes)) {
      // share of the word's UTF-16 code units matched by this language
      const matches = word.match(regex) || [];
      const score = matches.length / word.length;

      // 0 (no match) and NaN (empty word) are both falsy and are skipped
      if (score) {
        // high percentage, return result without checking later languages
        if (score > 0.85) {
          return lang;
        }
        scores[lang] = score;
      }
    }

    // not detected
    if (Object.keys(scores).length === 0) {
      return null;
    }

    // pick lang with highest percentage
    return Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b);
  }
}

Test

// Sample phrases ("I am a happy cow") in each language under test.
const values = [
  'I am a happy cow',           // English
  'Je suis une vache heureuse', // French
  'Ich bin eine glückliche Kuh',// German
  'Soy una vaca feliz',         // Spanish
  'Saya sapi yang bahagia',     // Indonesian
  '我是一头快乐的牛',             // Chinese Simplified
  '我是一頭快樂的牛',             // Chinese Traditional
  '私は幸せな牛です',             // Japanese
  'मैं एक खुश गाय हूँ',              // Hindi
  'انا بقرة سعيدة',             // Arabic
  'Я счастливая корова'         // Russian
];

// Map every phrase to the language code reported for it, then print the map.
const data = {};
for (const phrase of values) {
  data[phrase] = detectLanguage(phrase);
}
console.log(data);

Result:

All Latin characters are categorized as English.
Japanese is detected as Chinese (it contains 3 Chinese characters). Adding Japanese character detection would mitigate this issue.

{  "I am a happy cow": "en",  "Je suis une vache heureuse": "en",  "Ich bin eine glückliche Kuh": "en",  "Soy una vaca feliz": "en",  "Saya sapi yang bahagia": "en",  "我是一头快乐的牛": "zh",  "我是一頭快樂的牛": "zh",  "私は幸せな牛です": "zh",  "मैं एक खुश गाय हूँ": "hi",  "انا بقرة سعيدة": "ar",  "Я счастливая корова": null}

XRegExp

You can use XRegExp to make use of named unicode blocks or unicode scripts.

NOTE: I am not sure of status of unicode blocks/scripts support in browsers and Node.js

npm install xregexp

const XRegExp = require('xregexp');

/**
 * Detect the dominant language of a phrase using Unicode script matching.
 *
 * The phrase is split on whitespace, each word is classified by the share of
 * its characters that belong to a language's Unicode script, and the language
 * detected for the largest number of words wins. On a tie, the language that
 * reached the winning count first is returned.
 *
 * @param {string} text - The phrase to analyse.
 * @returns {?string} One of 'en', 'zh', 'hi', 'ar', 'bn', 'he', 'ru', 'jp',
 *   'pa', or null when no word could be classified.
 */
function detectLanguage(text) {
  // https://en.wikipedia.org/wiki/Unicode_block
  // http://www.regular-expressions.info/unicode.html#script
  // Built once per call instead of once per word.
  // NOTE: 'jp' is kept as the returned code for compatibility with existing
  // callers; the ISO 639-1 code for Japanese is 'ja'.
  const regexes = {
    en: XRegExp('\\p{Latin}', 'gi'),
    zh: XRegExp('\\p{Han}', 'gi'),
    hi: XRegExp('\\p{Devanagari}', 'gi'),
    ar: XRegExp('\\p{Arabic}', 'gi'),
    bn: XRegExp('\\p{Bengali}', 'gi'),
    he: XRegExp('\\p{Hebrew}', 'gi'),
    ru: XRegExp('\\p{Cyrillic}', 'gi'),
    jp: XRegExp('[\\p{Hiragana}\\p{Katakana}]', 'gi'),
    pa: XRegExp('\\p{Gurmukhi}', 'gi')
  };

  // split into words and classify each one
  const langs = text.trim().split(/\s+/).map(word => detect(word));

  // pick the lang with the most occurrences
  const counts = {};
  let best = null;
  for (const lang of langs) {
    // words that matched no script (e.g. "-19") do not take part in the vote
    if (lang === null) {
      continue;
    }
    counts[lang] = (counts[lang] || 0) + 1;
    // compare counts (not language codes); strict '>' keeps the earlier
    // language on a tie
    if (best === null || counts[lang] > counts[best]) {
      best = lang;
    }
  }
  return best;

  /**
   * Classify a single word.
   *
   * @param {string} word - A whitespace-free token.
   * @returns {?string} Language code, or null when no script matched.
   */
  function detect(word) {
    const scores = {};

    for (const [lang, regex] of Object.entries(regexes)) {
      // share of the word's UTF-16 code units matched by this script
      const matches = XRegExp.match(word, regex) || [];
      const score = matches.length / word.length;

      // 0 (no match) and NaN (empty word) are both falsy and are skipped
      if (score) {
        // high percentage, return result without checking later languages
        if (score > 0.85) {
          return lang;
        }
        scores[lang] = score;
      }
    }

    // not detected
    if (Object.keys(scores).length === 0) {
      return null;
    }

    // pick lang with highest percentage
    return Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b);
  }
}

Result:

The result is better, as detection of Japanese and Russian characters has been added.
It still cannot distinguish between languages written in Latin characters, which would need a proper language detection library.

{  "I am a happy cow": "en",  "Je suis une vache heureuse": "en",  "Ich bin eine glückliche Kuh": "en",  "Soy una vaca feliz": "en",  "Saya sapi yang bahagia": "en",  "我是一头快乐的牛": "zh",  "我是一頭快樂的牛": "zh",  "私は幸せな牛です": "jp",  "मैं एक खुश गाय हूँ": "hi",  "انا بقرة سعيدة": "ar",  "Я счастливая корова": "ru"}

I also tested with longer phrases that have Latin characters in them.

{  "What do we know so far about the Covid-19 variants?": "en",  "Que savons-nous jusqu'à présent des variantes du Covid-19?": "en",  "Was wissen wir bisher über die Covid-19-Varianten?": "en",  "¿Qué sabemos hasta ahora sobre las variantes de Covid-19?": "en",  "Apa yang kita ketahui sejauh ini tentang varian Covid-19?": "en",  "到目前为止，我们对Covid-19变体了解多少？": "zh",  "到目前為止，我們對Covid-19變體了解多少？": "zh",  "Covid-19バリアントについてこれまでに何を知っていますか？": "jp",  "हम कोविड -19 वेरिएंट के बारे में अब तक क्या जानते हैं?": "hi",  "ما الذي نعرفه حتى الآن عن بدائل Covid-19؟": "ar",  "Что мы знаем о вариантах Covid-19?": "ru"}