view runtime/tools/unicode.vim @ 20918:a253e5fc58d1

Added tag v8.2.1010 for changeset 5c6a6b5a33308a5e803f5289ea6c15c67d3f8db3
author Bram Moolenaar <Bram@vim.org>
date Fri, 19 Jun 2020 17:30:04 +0200
parents 723487cd7876
children bbca88cd13d5
line wrap: on
line source

" Script to extract tables from Unicode .txt files, to be used in src/mbyte.c.
" The format of the UnicodeData.txt file is explained here:
" http://www.unicode.org/Public/5.1.0/ucd/UCD.html
" For the other files see the header.
"
" Might need to update the URL to the emoji-data.txt
" Usage: Vim -S <this-file>
"
" Author: Bram Moolenaar
" Last Update: 2010 Jan 12

" Parse lines of UnicodeData.txt.  Creates a list of lists in s:dataprops.
func! ParseDataToProps()
  let s:dataprops = []
  let lnum = 1
  while lnum <= line('$')
    let l = split(getline(lnum), '\s*;\s*', 1)
    if len(l) != 15
      echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 15'
      return
    endif
    call add(s:dataprops, l)
    let lnum += 1
  endwhile
endfunc

" Parse lines of CaseFolding.txt.  Creates a list of lists in s:foldprops.
func! ParseFoldProps()
  let s:foldprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 4
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 4'
        return
      endif
      call add(s:foldprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Parse lines of EastAsianWidth.txt.  Creates a list of lists in s:widthprops.
func! ParseWidthProps()
  let s:widthprops = []
  let lnum = 1
  while lnum <= line('$')
    let line = getline(lnum)
    if line !~ '^#' && line !~ '^\s*$'
      let l = split(line, '\s*;\s*', 1)
      if len(l) != 2
        echoerr 'Found ' . len(l) . ' items in line ' . lnum . ', expected 2'
        return
      endif
      call add(s:widthprops, l)
    endif
    let lnum += 1
  endwhile
endfunc

" Build the toLower or toUpper table in a new buffer.
" Uses s:dataprops.
func! BuildCaseTable(name, index)
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:dataprops
    if p[a:index] != ''
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[a:index]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  exe "file to" . a:name
  call setline(1, "static convertStruct to" . a:name . "[] =")
  call setline(2, "{")
  call append('$', ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

" Build the foldCase table in a new buffer.
" Uses s:foldprops.
func! BuildFoldTable()
  let start = -1
  let end = -1
  let step = 0
  let add = -1
  let ranges = []
  for p in s:foldprops
    if p[1] == 'C' || p[1] == 'S'
      let n = ('0x' . p[0]) + 0
      let nl = ('0x' . p[2]) + 0
      if start >= 0 && add == nl - n && (step == 0 || n - end == step)
        " continue with same range.
        let step = n - end
        let end = n
      else
        if start >= 0
          " produce previous range
          call Range(ranges, start, end, step, add)
        endif
        let start = n
        let end = n
        let step = 0
        let add = nl - n
      endif
    endif
  endfor
  if start >= 0
    call Range(ranges, start, end, step, add)
  endif

  " New buffer to put the result in.
  new
  file foldCase
  call setline(1, "static convertStruct foldCase[] =")
  call setline(2, "{")
  call append('$', ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, "};")
  wincmd p
endfunc

func! Range(ranges, start, end, step, add)
  let s = printf("\t{0x%x,0x%x,%d,%d},", a:start, a:end, a:step == 0 ? -1 : a:step, a:add)
  call add(a:ranges, s)
endfunc

" Build the combining table.
" Uses s:dataprops.
func! BuildCombiningTable()
  let start = -1
  let end = -1
  let ranges = []
  for p in s:dataprops
    if p[2] == 'Mn' || p[2] == 'Mc' || p[2] == 'Me'
      let n = ('0x' . p[0]) + 0
      if start >= 0 && end + 1 == n
        " continue with same range.
        let end = n
      else
        if start >= 0
          " produce previous range
          call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
        endif
        let start = n
        let end = n
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
  endif

  " New buffer to put the result in.
  new
  file combining
  call setline(1, "    static struct interval combining[] =")
  call setline(2, "    {")
  call append('$', ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc

" Build the double width or ambiguous width table in a new buffer.
" Uses s:widthprops and s:dataprops.
func! BuildWidthTable(pattern, tableName)
  let start = -1
  let end = -1
  let ranges = []
  let dataidx = 0
  for p in s:widthprops
    if p[1][0] =~ a:pattern
      if p[0] =~ '\.\.'
        " It is a range.  we don't check for composing char then.
        let rng = split(p[0], '\.\.')
        if len(rng) != 2
          echoerr "Cannot parse range: '" . p[0] . "' in width table"
        endif
        let n = ('0x' . rng[0]) + 0
        let n_last =  ('0x' . rng[1]) + 0
      else
        let n = ('0x' . p[0]) + 0
        let n_last = n
      endif
      " Find this char in the data table.
      while 1
        let dn = ('0x' . s:dataprops[dataidx][0]) + 0
        if dn >= n
          break
        endif
        let dataidx += 1
      endwhile
      if dn != n && n_last == n
        echoerr "Cannot find character " . n . " in data table"
      endif
      " Only use the char when it's not a composing char.
      " But use all chars from a range.
      let dp = s:dataprops[dataidx]
      if n_last > n || (dp[2] != 'Mn' && dp[2] != 'Mc' && dp[2] != 'Me')
        if start >= 0 && end + 1 == n
          " continue with same range.
        else
          if start >= 0
            " produce previous range
            call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
	    if a:pattern == 'A'
	      call add(s:ambitable, [start, end])
	    else
	      call add(s:doubletable, [start, end])
	    endif
          endif
          let start = n
        endif
        let end = n_last
      endif
    endif
  endfor
  if start >= 0
    call add(ranges, printf("\t{0x%04x, 0x%04x},", start, end))
    if a:pattern == 'A'
      call add(s:ambitable, [start, end])
    else
      call add(s:doubletable, [start, end])
    endif
  endif

  " New buffer to put the result in.
  new
  exe "file " . a:tableName
  call setline(1, "    static struct interval " . a:tableName . "[] =")
  call setline(2, "    {")
  call append('$', ranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc

" Build the amoji width table in a new buffer.
func! BuildEmojiTable(pattern, tableName)
  let alltokens = []
  let widthtokens = []
  let lines = map(filter(filter(getline(1, '$'), 'v:val=~"^[1-9]"'), 'v:val=~a:pattern'), 'matchstr(v:val,"^\\S\\+")')
  for n in range(len(lines))
    let line = lines[n]
    let token = split(line, '\.\.')
    let first = ('0x' . token[0]) + 0
    if len(token) == 1
      let last = first
    else
      let last = ('0x' . token[1]) + 0
    endif

    let token = [first, last]
    if len(alltokens) > 0 && (token[0] - 1 == alltokens[-1][1])
      let alltokens[-1][1] = token[1]
    else
      call add(alltokens, token)
    endif

    " Characters below 1F000 may be considered single width traditionally,
    " making them double width causes problems.
    if first < 0x1f000
      continue
    endif

    " exclude characters that are in the "ambiguous" or "doublewidth" table
    for ambi in s:ambitable
      if first >= ambi[0] && first <= ambi[1]
	let first = ambi[1] + 1
      endif
      if last >= ambi[0] && last <= ambi[1]
	let last = ambi[0] - 1
      endif
    endfor
    for double in s:doubletable
      if first >= double[0] && first <= double[1]
	let first = double[1] + 1
      endif
      if last >= double[0] && last <= double[1]
	let last = double[0] - 1
      endif
    endfor

    if first <= last
      let token = [first, last]
      if len(widthtokens) > 0 && (token[0] - 1 == widthtokens[-1][1])
	let widthtokens[-1][1] = token[1]
      else
	call add(widthtokens, token)
      endif
    endif
  endfor
  let allranges = map(alltokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')
  let widthranges = map(widthtokens, 'printf("\t{0x%04x, 0x%04x},", v:val[0], v:val[1])')

  " New buffer to put the result in.
  new
  exe "file " . a:tableName . '_all'
  call setline(1, "    static struct interval " . a:tableName . "_all[] =")
  call setline(2, "    {")
  call append('$', allranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, "    };")
  wincmd p

  " New buffer to put the result in.
  new
  exe "file " . a:tableName . '_width'
  call setline(1, "    static struct interval " . a:tableName . "_width[] =")
  call setline(2, "    {")
  call append('$', widthranges)
  call setline('$', getline('$')[:-2])  " remove last comma
  call setline(line('$') + 1, "    };")
  wincmd p
endfunc

" Try to avoid hitting E36
set equalalways

" Edit the Unicode text file.  Requires the netrw plugin.
edit http://unicode.org/Public/UNIDATA/UnicodeData.txt

" Parse each line, create a list of lists.
call ParseDataToProps()

" Build the toLower table.
call BuildCaseTable("Lower", 13)

" Build the toUpper table.
call BuildCaseTable("Upper", 12)

" Build the ranges of composing chars.
call BuildCombiningTable()

" Edit the case folding text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/CaseFolding.txt

" Parse each line, create a list of lists.
call ParseFoldProps()

" Build the foldCase table.
call BuildFoldTable()

" Edit the width text file.  Requires the netrw plugin.
edit http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

" Parse each line, create a list of lists.
call ParseWidthProps()

" Build the double width table.
let s:doubletable = []
call BuildWidthTable('[WF]', 'doublewidth')

" Build the ambiguous width table.
let s:ambitable = []
call BuildWidthTable('A', 'ambiguous')

" Edit the emoji text file.  Requires the netrw plugin.
edit https://www.unicode.org/Public/emoji/11.0/emoji-data.txt
"edit http://www.unicode.org/Public/emoji/latest/emoji-data.txt

" Build the emoji table. Ver. 1.0 - 6.0
" Must come after the "ambiguous" table
call BuildEmojiTable('; Emoji\s\+#\s\+\d\+\.\d', 'emoji')