JavaScript: Regexp for Language Detection (Node.js)

May 12, 2021

The function splits a phrase into words (by whitespace) and detects the language of each word (by counting occurrences of each language’s characters).

E.g. My home is 马来西亚 = [‘en’, ‘en’, ‘en’, ‘zh’], thus this phrase is detected as English.

Currently it only supports English (all Latin characters, including German, Spanish, French, Indonesian, etc., will be categorized as English), Chinese, Hindi, Arabic, Bengali, and Hebrew.

/**
 * Guess the dominant language of a phrase from the characters it contains.
 *
 * The phrase is split on whitespace, each word is classified on its own, and
 * the language that was assigned to the most words wins. On a tie, the
 * language that reached the winning count first is returned.
 *
 * @param {string} text - Phrase to inspect.
 * @returns {string|null} One of 'en', 'zh', 'hi', 'ar', 'bn', 'he', or null
 *   when no word matched any of the known character ranges.
 */
function detectLanguage(text) {
  // Character ranges per language code. Built once per call rather than once
  // per word. Note that 'en' is the whole ASCII range, so digits and
  // punctuation count as English.
  const regexes = {
    'en': /[\u0000-\u007F]/gi,
    'zh': /[\u3000\u3400-\u4DBF\u4E00-\u9FFF]/gi,
    'hi': /[\u0900-\u097F]/gi,
    'ar': /[\u0621-\u064A\u0660-\u0669]/gi,
    'bn': /[\u0995-\u09B9\u09CE\u09DC-\u09DF\u0985-\u0994\u09BE-\u09CC\u09D7\u09BC]/gi,
    'he': /[\u0590-\u05FF]/gi,
  }

  /**
   * Classify a single word.
   *
   * @param {string} word - One whitespace-free token.
   * @returns {string|null} Language code, or null when nothing matched.
   */
  function detect(word) {
    // An empty token (e.g. from an empty phrase) has nothing to score.
    if (word.length === 0) {
      return null
    }

    const scores = {}

    for (const [lang, regex] of Object.entries(regexes)) {
      // fraction of the word's characters that belong to this language
      const matches = word.match(regex) || []
      const score = matches.length / word.length

      if (score) {
        // high percentage, return result
        if (score > 0.85) {
          return lang
        }
        scores[lang] = score
      }
    }

    // not detected
    if (Object.keys(scores).length === 0) {
      return null
    }

    // pick lang with highest percentage
    return Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b)
  }

  // split into words and classify each one
  const langs = text.trim().split(/\s+/).map(word => detect(word))

  // Pick the lang with the most occurrences. Counts are compared with counts;
  // comparing the current best language *name* with a count is always false
  // and would make the first detected word win regardless of frequency.
  const counts = {}
  let best = null
  for (const lang of langs) {
    // undetected words do not vote
    if (lang === null) {
      continue
    }
    counts[lang] = (counts[lang] || 0) + 1
    if (best === null || counts[lang] > counts[best]) {
      best = lang
    }
  }
  return best
}

Test

// Sample phrases: the same sentence in several languages and scripts.
const values = [
  // Latin script
  'I am a happy cow',            // English
  'Je suis une vache heureuse',  // French
  'Ich bin eine glückliche Kuh', // German
  'Soy una vaca feliz',          // Spanish
  'Saya sapi yang bahagia',      // Indonesian
  // Other scripts
  '我是一头快乐的牛', // Chinese Simplified
  '我是一頭快樂的牛', // Chinese Traditional
  '私は幸せな牛です', // Japanese
  'मैं एक खुश गाय हूँ', // Hindi
  'انا بقرة سعيدة', // Arabic
  'Я счастливая корова', // Russian
]

// Map every phrase to the language code reported for it.
const data = {}

for (const phrase of values) {
  data[phrase] = detectLanguage(phrase)
}

console.log(data)

Result:

  • All Latin characters are categorized as English.
  • Japanese is detected as Chinese (the phrase contains 3 Chinese characters). Adding Japanese character detection would mitigate this issue.
{
  "I am a happy cow": "en",
  "Je suis une vache heureuse": "en",
  "Ich bin eine glückliche Kuh": "en",
  "Soy una vaca feliz": "en",
  "Saya sapi yang bahagia": "en",
  "我是一头快乐的牛": "zh",
  "我是一頭快樂的牛": "zh",
  "私は幸せな牛です": "zh",
  "मैं एक खुश गाय हूँ": "hi",
  "انا بقرة سعيدة": "ar",
  "Я счастливая корова": null
}

XRegExp

You can use XRegExp to make use of named unicode blocks or unicode scripts.

NOTE: I am not sure of the status of Unicode block/script support in browsers and Node.js.

npm install xregexp
const XRegExp = require('xregexp');

/**
 * Guess the dominant language of a phrase from the Unicode scripts it uses.
 *
 * The phrase is split on whitespace, each word is classified on its own, and
 * the language that was assigned to the most words wins. On a tie, the
 * language that reached the winning count first is returned.
 *
 * Relies on the `XRegExp` binding required at the top of this snippet.
 *
 * @param {string} text - Phrase to inspect.
 * @returns {string|null} One of 'en', 'zh', 'hi', 'ar', 'bn', 'he', 'ru',
 *   'jp', 'pa', or null when no word matched any of the known scripts.
 */
function detectLanguage(text) {
  // https://en.wikipedia.org/wiki/Unicode_block
  // http://www.regular-expressions.info/unicode.html#script
  // Built once per call rather than once per word.
  const regexes = {
    en: XRegExp('\\p{Latin}', 'gi'),
    zh: XRegExp('\\p{Han}', 'gi'),
    hi: XRegExp('\\p{Devanagari}', 'gi'),
    ar: XRegExp('\\p{Arabic}', 'gi'),
    bn: XRegExp('\\p{Bengali}', 'gi'),
    he: XRegExp('\\p{Hebrew}', 'gi'),
    ru: XRegExp('\\p{Cyrillic}', 'gi'),
    jp: XRegExp('[\\p{Hiragana}\\p{Katakana}]', 'gi'),
    pa: XRegExp('\\p{Gurmukhi}', 'gi')
  }

  /**
   * Classify a single word.
   *
   * @param {string} word - One whitespace-free token.
   * @returns {string|null} Language code, or null when nothing matched.
   */
  function detect(word) {
    // An empty token (e.g. from an empty phrase) has nothing to score.
    if (word.length === 0) {
      return null
    }

    const scores = {}

    for (const [lang, regex] of Object.entries(regexes)) {
      // fraction of the word's characters that belong to this script
      const matches = XRegExp.match(word, regex) || []
      const score = matches.length / word.length

      if (score) {
        // high percentage, return result
        if (score > 0.85) {
          return lang
        }
        scores[lang] = score
      }
    }

    // not detected
    if (Object.keys(scores).length === 0) {
      return null
    }

    // pick lang with highest percentage
    return Object.keys(scores).reduce((a, b) => scores[a] > scores[b] ? a : b)
  }

  // split into words and classify each one
  const langs = text.trim().split(/\s+/).map(word => detect(word))

  // Pick the lang with the most occurrences. Counts are compared with counts;
  // comparing the current best language *name* with a count is always false
  // and would make the first detected word win regardless of frequency.
  const counts = {}
  let best = null
  for (const lang of langs) {
    // undetected words do not vote
    if (lang === null) {
      continue
    }
    counts[lang] = (counts[lang] || 0) + 1
    if (best === null || counts[lang] > counts[best]) {
      best = lang
    }
  }
  return best
}

Result:

  • Result is better as Japanese and Russian language characters detection is added
  • It still cannot detect language in latin characters, which might need a language detection library.
{
  "I am a happy cow": "en",
  "Je suis une vache heureuse": "en",
  "Ich bin eine glückliche Kuh": "en",
  "Soy una vaca feliz": "en",
  "Saya sapi yang bahagia": "en",
  "我是一头快乐的牛": "zh",
  "我是一頭快樂的牛": "zh",
  "私は幸せな牛です": "jp",
  "मैं एक खुश गाय हूँ": "hi",
  "انا بقرة سعيدة": "ar",
  "Я счастливая корова": "ru"
}

I also tested with longer phrases that have Latin characters mixed in.

{
  "What do we know so far about the Covid-19 variants?": "en",
  "Que savons-nous jusqu'à présent des variantes du Covid-19?": "en",
  "Was wissen wir bisher über die Covid-19-Varianten?": "en",
  "¿Qué sabemos hasta ahora sobre las variantes de Covid-19?": "en",
  "Apa yang kita ketahui sejauh ini tentang varian Covid-19?": "en",
  "到目前为止,我们对Covid-19变体了解多少?": "zh",
  "到目前為止,我們對Covid-19變體了解多少?": "zh",
  "Covid-19バリアントについてこれまでに何を知っていますか?": "jp",
  "हम कोविड -19 वेरिएंट के बारे में अब तक क्या जानते हैं?": "hi",
  "ما الذي نعرفه حتى الآن عن بدائل Covid-19؟": "ar",
  "Что мы знаем о вариантах Covid-19?": "ru"
}
This work is licensed under a
Creative Commons Attribution-NonCommercial 4.0 International License.