• (Long post) Metaphone Algorithm In AWK

    From Mike Sanders@21:1/5 to All on Sat Aug 17 14:18:58 2024
    Hi folks, hope you all are doing well.

    Please excuse long post, wanted to share this, some might find
    it handy given a certain context. Must run, I'm very behind in
    my work (hey I'm always running behind!)


    # metaphone.awk: Michael Sanders - 2024
    #
    # example invocation:
    #
    # echo "texas taxes taxi" | awk -f metaphone.awk -v find=texas
    #
    # notes:
    #
    # ever notice when you search for (say):
    #
    # 'i went to the zu'
    #
    # & your chosen search engine suggests something like:
    #
    # 'did you mean i went to the zoo'
    #
    # the metaphone algorithm handles such cases pretty well actually...
    #
    # Metaphone is a phonetic algorithm, published by Lawrence Philips in
    # 1990, for indexing words by their English pronunciation. It
    # fundamentally improves on the Soundex algorithm by using information
    # about variations and inconsistencies in English spelling and
    # pronunciation to produce a more accurate encoding, which does a
    # better job of matching words and names which sound similar.
    # https://en.wikipedia.org/wiki/Metaphone
    #
    # english only (sorry)
    #
    # not extensively tested, nevertheless a solid start, if you
    # improve this code please share your results
    #
    # other implentations...
    #
    # gist: https://gist.github.com/Rostepher/b688f709587ac145a0b3
    #
    # BASIC: http://aspell.net/metaphone/metaphone.basic
    #
    # C: http://aspell.net/metaphone/metaphone-kuhn.txt


    # check if a character is a vowel
    function isvowel(c, is_vowel) {
    is_vowel = c ~ /[AEIOU]/
    return is_vowel
    }

    # add a character or string to the result array
    function phonize(s, result, p_idx, i) {
    for (i = 1; i <= length(s); i++) {
    result[p_idx++] = substr(s, i, 1)
    }
    return p_idx
    }

    # compute metaphone code
    function metaphone(word, max_phonemes, result, p_idx, w_idx, c) {
    w_idx = 1
    p_idx = 1
    while (w_idx <= length(word) && p_idx <= max_phonemes) {
    c = toupper(substr(word, w_idx, 1))

    if (c == "B") {
    if (w_idx == 1 ||
    toupper(substr(word, w_idx - 1, 1)) != "M") {
    p_idx = phonize("B", result, p_idx)
    }
    } else if (c == "C") {
    if (toupper(substr(word, w_idx + 1, 1)) == "I" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[AO]/) {
    p_idx = phonize("SH", result, p_idx)
    } else if (toupper(substr(word, w_idx + 1, 1)) == "H") {
    p_idx = phonize("X", result, p_idx)
    w_idx++
    } else {
    p_idx = phonize("K", result, p_idx)
    }
    } else if (c == "D") {
    if (toupper(substr(word, w_idx + 1, 1)) == "G" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[EIY]/) {
    p_idx = phonize("J", result, p_idx)
    w_idx++
    } else {
    p_idx = phonize("T", result, p_idx)
    }
    } else if (c == "G") {
    if (toupper(substr(word, w_idx + 1, 1)) == "H") {
    if (w_idx > 1 &&
    toupper(substr(word, w_idx - 1, 1)) !~ /[BDH]/ &&
    toupper(substr(word, w_idx - 2, 1)) != "H") {
    p_idx = phonize("F", result, p_idx)
    w_idx++
    }
    } else if (toupper(substr(word, w_idx + 1, 1)) != "N" ||
    toupper(substr(word, w_idx + 2, 1)) != "E") {
    p_idx = phonize("K", result, p_idx)
    }
    } else if (c == "H") {
    if (isvowel(toupper(substr(word, w_idx + 1, 1))) &&
    toupper(substr(word, w_idx - 1, 1)) !~ /[CGPST]/) {
    p_idx = phonize("H", result, p_idx)
    }
    } else if (c == "K") {
    if (w_idx == 1 ||
    toupper(substr(word, w_idx - 1, 1)) != "C") {
    p_idx = phonize("K", result, p_idx)
    }
    } else if (c == "P") {
    if (toupper(substr(word, w_idx + 1, 1)) == "H") {
    p_idx = phonize("F", result, p_idx)
    } else {
    p_idx = phonize("P", result, p_idx)
    }
    } else if (c == "Q") {
    p_idx = phonize("K", result, p_idx)
    } else if (c == "S") {
    if (toupper(substr(word, w_idx + 1, 1)) == "H") {
    p_idx = phonize("SH", result, p_idx)
    w_idx++
    } else if (toupper(substr(word, w_idx + 1, 1)) == "C" &&
    toupper(substr(word, w_idx + 2, 1)) == "H") {
    p_idx = phonize("X", result, p_idx)
    w_idx += 2
    } else {
    p_idx = phonize("S", result, p_idx)
    }
    } else if (c == "T") {
    if (toupper(substr(word, w_idx + 1, 1)) == "I" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[AO]/) {
    p_idx = phonize("SH", result, p_idx)
    } else if (toupper(substr(word, w_idx + 1, 1)) == "H") {
    p_idx = phonize("TH", result, p_idx)
    w_idx++
    } else if (toupper(substr(word, w_idx + 1, 1)) != "C" ||
    toupper(substr(word, w_idx + 2, 1)) != "H") {
    p_idx = phonize("T", result, p_idx)
    }
    } else if (c == "V") {
    p_idx = phonize("F", result, p_idx)
    } else if (c == "W" || c == "Y") {
    if (isvowel(toupper(substr(word, w_idx + 1, 1)))) {
    p_idx = phonize(c, result, p_idx)
    }
    } else if (c == "X") {
    p_idx = phonize("KS", result, p_idx)
    } else if (c == "Z") {
    p_idx = phonize("S", result, p_idx)
    }

    w_idx++
    }

    if (p_idx > max_phonemes) p_idx = max_phonemes

    return substr(combine(result), 1, p_idx)
    }

    # combine array into a string
    function combine(array, result, i) {
    result = ""
    for (i in array) {
    result = result array[i]
    }
    return result
    }

    BEGIN {
    # store metaphone code for "find" variable
    find_code = metaphone(find, 4)
    }

    {
    # loop through all fields (words) in the input line
    for (x = 1; x <= NF; x++) {
    # compute metaphone code for the current word
    word_code = metaphone($x, 4)
    # check if metaphone code of the current word matches
    if (word_code == find_code) {
    print $x # print matching word
    }
    }
    }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Mike Sanders on Mon Aug 19 00:46:46 2024
    porkchop@invalid.foo (Mike Sanders) writes:

    Hi folks, hope you all are doing well.

    Please excuse long post, wanted to share this, some might find
    it handy given a certain context. Must run, I'm very behind in
    my work (hey I'm always running behind!)

    Using a word list, I found some odd matches. For example:

    $ echo "drunkeness indigestion" | awk -f metaphone.awk -v find=texas
    drunkeness
    indigestion

    Are these really metaphone matches for "texas"? It's possible (I don't
    know the algorithm at all well) but I found it surprising.

    # metaphone.awk: Michael Sanders - 2024
    #
    # example invocation:
    #
    # echo "texas taxes taxi" | awk -f metaphone.awk -v find=texas
    #
    # notes:
    #
    # ever notice when you search for (say):
    #
    # 'i went to the zu'
    #
    # & your chosen search engine suggests something like:
    #
    # 'did you mean i went to the zoo'
    #
    # the metaphone algorithm handles such cases pretty well actually...
    #
    # Metaphone is a phonetic algorithm, published by Lawrence Philips in
    # 1990, for indexing words by their English pronunciation. It
    # fundamentally improves on the Soundex algorithm by using information
    # about variations and inconsistencies in English spelling and
    # pronunciation to produce a more accurate encoding, which does a
    # better job of matching words and names which sound similar.
    # https://en.wikipedia.org/wiki/Metaphone
    #
    # english only (sorry)
    #
    # not extensively tested, nevertheless a solid start, if you
    # improve this code please share your results
    #
    # other implentations...
    #
    # gist: https://gist.github.com/Rostepher/b688f709587ac145a0b3
    #
    # BASIC: http://aspell.net/metaphone/metaphone.basic
    #
    # C: http://aspell.net/metaphone/metaphone-kuhn.txt

    I wanted a "reference" implementation I could try, but this is not a
    useful C program. It's in a odd dialect (it uses void but has K&R
    function definitions) and has loads of undefined behaviours (strcpy of overlapping strings, use if uninitialised variables etc).

    # check if a character is a vowel
    function isvowel(c, is_vowel) {
    is_vowel = c ~ /[AEIOU]/
    return is_vowel
    }

    I was not going to comment on the code, but this hit me just before I
    posted. Given the odd way AWK functions have to define locals, I tend
    to use them only when really needed. Here I think I would just write

    function isvowel(c) {
    return c ~ /[AEIOU]/
    }

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Ben Bacarisse on Mon Aug 19 02:15:43 2024
    A correction...

    Ben Bacarisse <ben@bsb.me.uk> writes:

    porkchop@invalid.foo (Mike Sanders) writes:

    Hi folks, hope you all are doing well.

    Please excuse long post, wanted to share this, some might find
    it handy given a certain context. Must run, I'm very behind in
    my work (hey I'm always running behind!)

    Using a word list, I found some odd matches. For example:

    $ echo "drunkeness indigestion" | awk -f metaphone.awk -v find=texas drunkeness
    indigestion

    Are these really metaphone matches for "texas"? It's possible (I don't
    know the algorithm at all well) but I found it surprising.

    I got the C code to compile and these should not match if the C code is
    working correctly.

    # metaphone.awk: Michael Sanders - 2024
    #
    # example invocation:
    #
    # echo "texas taxes taxi" | awk -f metaphone.awk -v find=texas
    #
    # notes:
    #
    # ever notice when you search for (say):
    #
    # 'i went to the zu'
    #
    # & your chosen search engine suggests something like:
    #
    # 'did you mean i went to the zoo'
    #
    # the metaphone algorithm handles such cases pretty well actually...
    #
    # Metaphone is a phonetic algorithm, published by Lawrence Philips in
    # 1990, for indexing words by their English pronunciation. It
    # fundamentally improves on the Soundex algorithm by using information
    # about variations and inconsistencies in English spelling and
    # pronunciation to produce a more accurate encoding, which does a
    # better job of matching words and names which sound similar.
    # https://en.wikipedia.org/wiki/Metaphone
    #
    # english only (sorry)
    #
    # not extensively tested, nevertheless a solid start, if you
    # improve this code please share your results
    #
    # other implentations...
    #
    # gist: https://gist.github.com/Rostepher/b688f709587ac145a0b3
    #
    # BASIC: http://aspell.net/metaphone/metaphone.basic
    #
    # C: http://aspell.net/metaphone/metaphone-kuhn.txt

    I wanted a "reference" implementation I could try, but this is not a
    useful C program. It's in a odd dialect (it uses void but has K&R
    function definitions) and has loads of undefined behaviours (strcpy of overlapping strings, use if uninitialised variables etc).

    The uninitialised variables were due to an undefined function. Most
    likely, that function was intended to initialise the array. I've mocked
    up the two undefined functions and can now get the code to run. I don't
    see any uninitialised variables being used now. The code still has
    undefined behaviour in some cases but I think that is limited to the use
    of strcpy.

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Ben Bacarisse on Mon Aug 19 03:22:40 2024
    Ben Bacarisse <ben@bsb.me.uk> wrote:

    Using a word list, I found some odd matches.

    Hey Ben. Probaly a ton of them (see refactored script below...)

    Most often used invocation this time around was:

    echo "taxes drunkeness indigestion" | awk -f metaphone.awk -v find=texas

    Lengthened phoneme count, accounted for doublets like "ness",
    added Levenshtein distance & note next url (& its note near
    the bottom of the page):

    https://tilores.io/metaphone-phonetic-algorithm-online-tool

    Likely lots left to do. Have at it if you're in the mood,
    I'll weave in your & others changes (with credit) & repost
    here if & when they appear.

    Will catch up soon & have fun =)

    # Metaphone Algorithm in AWK v2: Michael Sanders - 2024

    BEGIN { find_code = metaphone(find, 10) }

    # -----------------------------------------------------------------

    # entry point...
    {
    for (x = 1; x <= NF; x++) {
    if (similarity(metaphone($x, 10), find_code) >= 80)
    print find " : " $x
    }
    }

    # -----------------------------------------------------------------

    # check if character is a vowel
    function isvowel(c, is_vowel) { return c ~ /[AEIOU]/ }

    # -----------------------------------------------------------------

    # combine array into a string
    function combine(array, result, i) {
    for (i in array) { result = result array[i] }

    return result
    }

    # -----------------------------------------------------------------

    # add a character or string to the result array
    function phonize(s, result, p_idx, i) {
    for (i = 1; i <= length(s); i++) {
    result[p_idx++] = substr(s, i, 1)
    }
    return p_idx
    }

    # -----------------------------------------------------------------

    # compute metaphone code
    function metaphone(word, max_phonemes, result,
    p_idx, w_idx, c, next_c, last_c) {
    w_idx = 1
    p_idx = 1

    while (w_idx <= length(word) && p_idx <= max_phonemes) {
    c = toupper(substr(word, w_idx, 1))
    next_c = toupper(substr(word, w_idx + 1, 1))

    # skip duplicate letters
    if (c == last_c) {
    w_idx++
    continue
    }

    if (c == "B") {
    if (w_idx == 1 ||
    toupper(substr(word, w_idx - 1, 1)) != "M") {
    p_idx = phonize("B", result, p_idx)
    }
    } else if (c == "C") {
    if (next_c == "I" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[AO]/) {
    p_idx = phonize("SH", result, p_idx)
    } else if (next_c == "H") {
    p_idx = phonize("X", result, p_idx)
    w_idx++
    } else {
    p_idx = phonize("K", result, p_idx)
    }
    } else if (c == "D") {
    if (next_c == "G" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[EIY]/) {
    p_idx = phonize("J", result, p_idx)
    w_idx++
    } else {
    p_idx = phonize("T", result, p_idx)
    }
    } else if (c == "G") {
    if (next_c == "I" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[EOY]/) {
    p_idx = phonize("J", result, p_idx)
    } else if (next_c == "H" &&
    !(w_idx > 1 &&
    toupper(substr(word, w_idx - 1, 1)) ~ /[BDH]/)) {
    p_idx = phonize("F", result, p_idx)
    w_idx++
    } else if (next_c != "N" ||
    toupper(substr(word, w_idx + 2, 1)) != "E") {
    p_idx = phonize("K", result, p_idx)
    }
    } else if (c == "H") {
    if (isvowel(next_c) &&
    toupper(substr(word, w_idx - 1, 1)) !~ /[CGPST]/) {
    p_idx = phonize("H", result, p_idx)
    }
    } else if (c == "K") {
    if (w_idx == 1 ||
    toupper(substr(word, w_idx - 1, 1)) != "C") {
    p_idx = phonize("K", result, p_idx)
    }
    } else if (c == "N") {
    p_idx = phonize("N", result, p_idx)
    } else if (c == "P") {
    if (next_c == "H") {
    p_idx = phonize("F", result, p_idx)
    } else {
    p_idx = phonize("P", result, p_idx)
    }
    } else if (c == "Q") {
    p_idx = phonize("K", result, p_idx)
    } else if (c == "R") {
    p_idx = phonize("R", result, p_idx)
    } else if (c == "S") {
    if (next_c == "H") {
    p_idx = phonize("SH", result, p_idx)
    w_idx++
    } else if (next_c == "C" &&
    toupper(substr(word, w_idx + 2, 1)) == "H") {
    p_idx = phonize("X", result, p_idx)
    w_idx += 2
    } else {
    p_idx = phonize("S", result, p_idx)
    }
    } else if (c == "T") {
    if (next_c == "I" &&
    toupper(substr(word, w_idx + 2, 1)) ~ /[AO]/) {
    p_idx = phonize("X", result, p_idx)
    } else if (next_c == "H") {
    p_idx = phonize("0", result, p_idx)
    w_idx++
    } else if (next_c != "C" ||
    toupper(substr(word, w_idx + 2, 1)) != "H") {
    p_idx = phonize("T", result, p_idx)
    }
    } else if (c == "V") {
    p_idx = phonize("F", result, p_idx)
    } else if (c == "W" || c == "Y") {
    if (isvowel(next_c)) {
    p_idx = phonize(c, result, p_idx)
    }
    } else if (c == "X") {
    p_idx = phonize("KS", result, p_idx)
    } else if (c == "Z") {
    p_idx = phonize("S", result, p_idx)
    }

    last_c = c # Track last character processed
    w_idx++
    }

    if (p_idx > max_phonemes) p_idx = max_phonemes

    return substr(combine(result), 1, p_idx)
    }

    # -----------------------------------------------------------------

    # calculate levenshtein distance between 2 words
    # https://en.wikipedia.org/wiki/Levenshtein_distance
    # https://rosettacode.org/wiki/Levenshtein_distance#AWK

    function levenshtein(s1, s2, l1, l2, cost, i, j, dist) {
    l1 = length(s1)
    l2 = length(s2)

    # initialize distance array
    for (i = 0; i <= l1; i++) dist[i, 0] = i
    for (j = 0; j <= l2; j++) dist[0, j] = j

    # compute distance
    for (i = 1; i <= l1; i++) {
    for (j = 1; j <= l2; j++) {
    cost = (substr(s1, i, 1) ==
    substr(s2, j, 1)) ? 0 : 1
    dist[i, j] = min3(dist[i-1, j] + 1, # deletion
    dist[i, j-1] + 1, # insertion
    dist[i-1, j-1] + cost) # substitution
    }
    }

    return dist[l1, l2]
    }

    # -----------------------------------------------------------------

    # minimum of 3 numbers (levenshtein helper function)
    function min3(a, b, c) {
    return (a < b ? (a < c ? a : c) : (b < c ? b : c))
    }

    # -----------------------------------------------------------------

    # calculate similarity as a percentage
    function similarity(word1, word2, lev_dist, max_len) {
    lev_dist = levenshtein(word1, word2)
    max_len = (length(word1) >
    length(word2)) ?
    length(word1) :
    length(word2)

    if (max_len == 0) return 100 # both strings empty

    return (1 - lev_dist / max_len) * 100
    }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Mike Sanders on Mon Aug 19 04:34:26 2024
    Mike Sanders <porkchop@invalid.foo> wrote:

    # Metaphone Algorithm in AWK v2: Michael Sanders - 2024

    [...]

    # entry point...
    {
    for (x = 1; x <= NF; x++) {
    if (similarity(metaphone($x, 10), find_code) >= 80)
    print find " : " $x
    }
    }

    # entry point...
    #
    # match is considered true when:
    # similarity is at least 80%
    # distance is 3 or less
    # phonetic encoding is equal
    #
    # https://tilores.io/metaphone-phonetic-algorithm-online-tool

    {
    for (x = 1; x <= NF; x++) {
    word_code = metaphone($x, 10)
    dist = levenshtein($x, find)
    psim = similarity($x, find)
    if (psim >= 80 && dist <= 3 && word_code == find_code)
    print find " : " $x
    }
    }

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Ben Bacarisse on Tue Aug 20 05:45:33 2024
    Ben Bacarisse <ben@bsb.me.uk> wrote:

    Using a word list, I found some odd matches. For example:

    $ echo "drunkeness indigestion" | awk -f metaphone.awk -v find=texas drunkeness
    indigestion

    Are these really metaphone matches for "texas"? It's possible (I don't
    know the algorithm at all well) but I found it surprising.

    Ben, give this try when you can. Finally starting to wrap my mind around
    its usage a little more...

    # Metaphone Algorithm In AWK v3: Michael Sanders - 2024
    #
    # tighter, cleaner, better
    #
    # example invocation: awk -f metephone.awk -v find=cork < words.txt

    BEGIN { find_code = metaphone(find) }

    # -----------------------------------------------------------------

    # emit metaphone codes only
    # { for (x = 1; x <= NF; x++) print $x " : " metaphone($x) }

    # tweek levenshtein distance to open/constrain results...
    {

    for (x = 1; x <= NF; x++)
    if (metaphone($x) == find_code && levenshtein($x, find) <= 2)
    print $x " : " find

    }

    # -----------------------------------------------------------------

    function isvowel(char) { return char ~ /[AEIOU]/ }

    # -----------------------------------------------------------------

    function metaphone(word, m, c, next_c, len, i) {
    word = toupper(word)
    gsub(/[^A-Z]/, "", word) # strip non-alphabetic characters
    len = length(word)

    # handle initial letters
    if (substr(word, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
    word = substr(word, 2)
    len--
    }

    for (i = 1; i <= len; i++) {
    c = substr(word, i, 1)
    next_c = (i < len) ? substr(word, i + 1, 1) : ""

    # skip duplicate letters except for 'C'
    if (i > 1 && c == substr(word, i - 1, 1) && c != "C") continue

    # handle vowels: retain only if it's 1st letter
    if (isvowel(c)) {
    if (i == 1) m = m c
    }
    # consonants
    else if (c == "B") {
    if (!(i == len && substr(word, i - 1, 1) == "M")) m = m "B"
    }
    else if (c == "C") {
    if (substr(word, i, 2) == "CH") {
    m = m "X"
    i++
    } else if (substr(word, i, 2) ~ /^(CI|CE|CY)/) {
    m = m "S"
    } else {
    m = m "K"
    }
    }
    else if (c == "D") {
    if (substr(word, i, 2) == "DG" && substr(word, i + 2, 1) ~ /[IEY]/) {
    m = m "J"
    i += 2
    } else {
    m = m "T"
    }
    }
    else if (c == "G") {
    if (substr(word, i, 2) == "GH" && (i == 1 || !isvowel(substr(word, i - 1, 1)))) {
    i++
    } else if (substr(word, i, 2) == "GN" || (i == len && c == "G")) {
    continue
    } else if (substr(word, i, 3) ~ /^(GIA|GIE|GEY)/) {
    m = m "J"
    } else {
    m = m "K"
    }
    }
    else if (c == "H") {
    if (i == 1 || substr(word, i - 1, 1) !~ /[CSPTG]/) {
    if (i < len && !isvowel(next_c)) {
    m = m "H"
    }
    }
    }
    else if (c == "K") {
    if (i == 1 || substr(word, i - 1, 1) != "C") m = m "K"
    }
    else if (c == "P") {
    if (substr(word, i, 2) == "PH") {
    m = m "F"
    i++
    } else {
    m = m "P"
    }
    }
    else if (c == "Q") {
    m = m "K"
    }
    else if (c == "S") {
    if (substr(word, i, 2) == "SH") {
    m = m "X"
    i++
    } else if (substr(word, i, 3) == "TIA" || substr(word, i, 3) == "TIO") {
    m = m "X"
    i += 2
    } else {
    m = m "S"
    }
    }
    else if (c == "T") {
    if (substr(word, i, 2) == "TH") {
    m = m "T"
    i++
    } else if (substr(word, i, 3) == "TIA" || substr(word, i, 3) == "TIO") {
    m = m "X"
    i += 2
    } else {
    m = m "T"
    }
    }
    else if (c == "V") {
    m = m "F"
    }
    else if (c == "W" || c == "Y") {
    if (i < len && isvowel(next_c)) m = m c
    }
    else if (c == "X") {
    m = m "KS"
    }
    else if (c == "Z") {
    m = m "S"
    }
    # ensure 'M', 'N', and 'L' are always retained
    else if (c == "M" || c == "N" || c == "L") {
    m = m c
    }
    }

    return m
    }

    # -----------------------------------------------------------------

    function levenshtein(word1, word2, l1, l2, cst, i, j, diz) {
    l1 = length(word1)
    l2 = length(word2)

    # initialize distance array
    for (i = 0; i <= l1; i++) diz[i, 0] = i
    for (j = 0; j <= l2; j++) diz[0, j] = j

    # compute distance
    for (i = 1; i <= l1; i++) {
    for (j = 1; j <= l2; j++) {
    cst = (substr(word1, i, 1) == substr(word2, j, 1)) ? 0 : 1
    diz[i, j] = (diz[i-1, j] + 1 < diz[i, j-1] + 1) ? \
    (diz[i-1, j] + 1 < diz[i-1, j-1] + cst ? \
    diz[i-1, j] + 1 : diz[i-1, j-1] + cst) : \
    (diz[i, j-1] + 1 < diz[i-1, j-1] + cst ? \
    diz[i, j-1] + 1 : diz[i-1, j-1] + cst)
    }
    }

    return diz[l1, l2]
    }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to All on Tue Aug 20 11:33:33 2024
    Okay, essentially complete (enough for me at least).

    - added sorely needed 'TH' digraph handling

    - helper functions refactored

    - misc code cleanup

    # Metaphone Algorithm In AWK v4: Michael Sanders - 2024
    #
    # https://en.wikipedia.org/wiki/Metaphone
    #
    # example invocation: awk -f metaphone.awk -v find=butter < words.txt

    BEGIN { find_code = metaphone(find) }

    # -----------------------------------------------------------------

    # emit metaphone codes only
    # { for (x = 1; x <= NF; x++) print $x " : " metaphone($x) }

    # tweek levenshtein distance to open/constrain results...
    {

    for (x = 1; x <= NF; x++)
    if (metaphone($x) == find_code && levenshtein($x, find) <= 2)
    print $x

    }

    # -----------------------------------------------------------------

    function metaphone(w, m, c, n, z, i) {
    w = toupper(w)
    gsub(/[^A-Z]/, "", w) # strip non-alphabetic characters
    z = length(w)

    # handle initial letters
    if (substr(w, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
    w = substr(w, 2)
    z--
    }

    for (i = 1; i <= z; i++) {
    c = substr(w, i, 1)
    n = (i < z) ? substr(w, i + 1, 1) : ""

    # skip duplicate letters except for 'C'
    if (i > 1 && c == substr(w, i - 1, 1) && c != "C") continue

    # handle vowels: retain only if it's 1st letter
    if (isvowel(c)) {
    if (i == 1) m += c
    }
    # consonants
    else if (c == "B") {
    if (!(i == z && substr(w, i - 1, 1) == "M")) m += "B"
    }
    else if (c == "C") {
    if (substr(w, i, 2) == "CH") {
    m += "X"
    i++
    } else if (substr(w, i, 2) ~ /^(CI|CE|CY)/) {
    m += "S"
    } else {
    m += "K"
    }
    }
    else if (c == "D") {
    if (substr(w, i, 2) == "DG" && substr(w, i + 2, 1) ~ /[IEY]/) {
    m += "J"
    i += 2
    } else {
    m += "T"
    }
    }
    else if (c == "G") {
    if (substr(w, i, 2) == "GH" && (i == 1 || !isvowel(substr(w, i - 1, 1)))) {
    i++
    } else if (substr(w, i, 2) == "GN" || (i == z && c == "G")) {
    continue
    } else if (substr(w, i, 3) ~ /^(GIA|GIE|GEY)/) {
    m += "J"
    } else {
    m += "K"
    }
    }
    else if (c == "H") {
    if (i == 1 || substr(w, i - 1, 1) !~ /[CSPTG]/) {
    if (i < z && !isvowel(n)) {
    m += "H"
    }
    }
    }
    else if (c == "K") {
    if (i == 1 || substr(w, i - 1, 1) != "C") m += "K"
    }
    else if (c == "P") {
    if (substr(w, i, 2) == "PH") {
    m += "F"
    i++
    } else {
    m += "P"
    }
    }
    else if (c == "Q") {
    m += "K"
    }
    else if (c == "S") {
    if (substr(w, i, 2) == "SH") {
    m += "X"
    i++
    } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
    m += "X"
    i += 2
    } else {
    m += "S"
    }
    }
    else if (c == "T") {
    if (substr(w, i, 2) == "TH") {
    m += "0" # add '0' for 'TH' digraph to distinguish from regular 'T'
    i++
    } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
    m += "X"
    i += 2
    } else {
    m += "T"
    }
    }
    else if (c == "V") {
    m += "F"
    }
    else if (c == "W" || c == "Y") {
    if (i < z && isvowel(n)) m += c
    }
    else if (c == "X") {
    m += "KS"
    }
    else if (c == "Z") {
    m += "S"
    }
    # ensure 'M', 'N', and 'L' are always retained
    else if (c == "M" || c == "N" || c == "L") {
    m += c
    }
    }

    return m
    }

    # -----------------------------------------------------------------

    function levenshtein(w1, w2, l1, l2, i, j, cst, diz) {
    l1 = length(w1)
    l2 = length(w2)

    # initialize distance array
    for (i = 0; i <= l1; i++) diz[i, 0] = i
    for (j = 0; j <= l2; j++) diz[0, j] = j

    # compute distance
    for (i = 1; i <= l1; i++) {
    for (j = 1; j <= l2; j++) {
    cst = (substr(w1, i, 1) == substr(w2, j, 1)) ? 0 : 1
    diz[i, j] = min3(diz[i-1, j] + 1, # deletion
    diz[i, j-1] + 1, # insertion
    diz[i-1, j-1] + cst) # substitution
    }
    }

    return diz[l1, l2]
    }

    # -----------------------------------------------------------------

    # metaphone helper function
    function isvowel(char) { return char ~ /[AEIOU]/ }

    # -----------------------------------------------------------------

    # levenshtein helper function
    function min3(a, b, c) {
    return (a < b ? (a < c ? a : c) : (b < c ? b : c))
    }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Mike Sanders on Wed Aug 21 00:58:06 2024
    porkchop@invalid.foo (Mike Sanders) writes:

    Ben Bacarisse <ben@bsb.me.uk> wrote:

    Using a word list, I found some odd matches. For example:

    $ echo "drunkeness indigestion" | awk -f metaphone.awk -v find=texas
    drunkeness
    indigestion

    Are these really metaphone matches for "texas"? It's possible (I don't
    know the algorithm at all well) but I found it surprising.

    Ben, give this try when you can. Finally starting to wrap my mind around
    its usage a little more...

    I don't know what your are asking for as this (your latest AWK) is not
    just an implementation of the metaphone algorithm. With the extra
    Levenshtein test it "texas" matches only a few words.

    However, if I remove the extra condition (that levenshtein($x, find) <=
    2) your AWK code matches a different set of words to the C
    implementation. Looking a bit deeper, your AWK code give the code TKSS
    to the word "texas" but the C code assigns is "TKS".

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Ben Bacarisse on Wed Aug 21 01:07:29 2024
    Ben Bacarisse <ben@bsb.me.uk> wrote:

    I don't know what your are asking for as this (your latest AWK) is not
    just an implementation of the metaphone algorithm. With the extra Levenshtein test it "texas" matches only a few words.

    Not seeking/asking for anything Ben, just enjoy the ride =)

    As for my Metaphone take... In fact it is. Several Metaphone variants
    use Levenshtein & can be any mixture of three types of Metaphone
    versions further still, or even a mix. Seems that's the way it is
    in the wild...

    However, if I remove the extra condition (that levenshtein($x, find) <=
    2) your AWK code matches a different set of words to the C
    implementation. Looking a bit deeper, your AWK code give the code TKSS
    to the word "texas" but the C code assigns is "TKS".

    Just differing metaphone variants, witness...

    Texas = Tex[ess] (if phonetically pronounced - almost slurred sounding)

    But hey: Many thanks for your input kind sir, I appreciate it.

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Mike Sanders on Wed Aug 21 02:50:07 2024
    Mike Sanders <porkchop@invalid.foo> wrote:

    Ben Bacarisse <ben@bsb.me.uk> wrote:

    However, if I remove the extra condition (that levenshtein($x, find) <=
    2) your AWK code matches a different set of words to the C
    implementation. Looking a bit deeper, your AWK code give the code TKSS
    to the word "texas" but the C code assigns is "TKS".

    Just differing metaphone variants, witness...

    Texas = Tex[ess] (if phonetically pronounced - almost slurred sounding)

    See also:

    https://tilores.io/metaphone-phonetic-algorithm-online-tool?t1=texas&t2=taxi

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to All on Wed Aug 21 02:42:07 2024
    just in case...

    not sure its wise to use 'm += var' with digits:

    m += string # valid
    m += "7" # may be invalid if its a digit (even if quoted)

    replaced all instances of: m += var
    with: m = m var

    any one know for sure?

    # Metaphone Algorithm In AWK v5: Michael Sanders - 2024
    #
    # https://en.wikipedia.org/wiki/Metaphone
    #
    # example invocation: awk -f metaphone.awk -v find=butter < words.txt

    BEGIN { find_code = metaphone(find) }

    # -----------------------------------------------------------------

    # emit metaphone codes only
    # { for (x = 1; x <= NF; x++) { print metaphone($x) }; exit }

    # tweek levenshtein distance to open/constrain results...
    {
    for (x = 1; x <= NF; x++)
    if (metaphone($x) == find_code && levenshtein($x, find) <= 2)
    print $x
    }

    # -----------------------------------------------------------------

    function metaphone(w, m, c, n, z, i) {
    # convert the word to uppercase and strip non-alphabetic characters
    w = toupper(w)
    gsub(/[^A-Z]/, "", w)
    z = length(w)

    # handle initial letters
    if (substr(w, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
    w = substr(w, 2)
    z--
    }

    for (i = 1; i <= z; i++) {
    c = substr(w, i, 1)
    n = (i < z) ? substr(w, i + 1, 1) : ""

    # skip duplicate letters except for 'C'
    if (i > 1 && c == substr(w, i - 1, 1) && c != "C") continue

    # handle vowels: retain only if it's 1st letter
    if (isvowel(c)) {
    if (i == 1) m = m c
    }
    # consonants
    else if (c == "B") {
    if (!(i == z && substr(w, i - 1, 1) == "M")) m = m "B"
    }
    else if (c == "C") {
    if (substr(w, i, 2) == "CH") {
    m = m "X"
    i++
    } else if (substr(w, i, 2) ~ /^(CI|CE|CY)/) {
    m = m "S"
    } else {
    m = m "K"
    }
    }
    else if (c == "D") {
    if (substr(w, i, 2) == "DG" && isvowel(substr(w, i + 2, 1))) {
    m = m "J"
    i += 2
    } else {
    m = m "T"
    }
    }
    else if (c == "G") {
    if (substr(w, i, 2) == "GH" && (i == 1 || !isvowel(substr(w, i - 1, 1)))) {
    i++
    } else if (substr(w, i, 2) == "GN" || (i == z && c == "G")) {
    continue
    } else if (substr(w, i, 3) ~ /^(GIA|GIE|GEY)/) {
    m = m "J"
    } else {
    m = m "K"
    }
    }
    else if (c == "H") {
    if (i == 1 || substr(w, i - 1, 1) !~ /[CSPTG]/) {
    if (i < z && !isvowel(n)) {
    m = m "H"
    }
    }
    }
    else if (c == "K") {
    if (i == 1 || substr(w, i - 1, 1) != "C") m = m "K"
    }
    else if (c == "P") {
    if (substr(w, i, 2) == "PH") {
    m = m "F"
    i++
    } else {
    m = m "P"
    }
    }
    else if (c == "Q") {
    m = m "K"
    }
    else if (c == "S") {
    if (substr(w, i, 2) == "SH") {
    m = m "X"
    i++
    } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
    m = m "X"
    i += 2
    } else {
    m = m "S"
    }
    }
    else if (c == "T") {
    if (substr(w, i, 2) == "TH") {
    m = m "0" # add '0' for 'TH' digraph to distinguish from regular 'T'
    i++
    } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
    m = m "X"
    i += 2
    } else {
    m = m "T"
    }
    }
    else if (c == "V") {
    m = m "F"
    }
    else if (c == "W" || c == "Y") {
    if (i < z && isvowel(n)) m = m c
    }
    else if (c == "X") {
    m = m "KS"
    }
    else if (c == "Z") {
    m = m "S"
    }
    # ensure 'M', 'N', and 'L' are always retained
    else if (c == "M" || c == "N" || c == "L") {
    m = m c
    }
    }

    return m
    }

    # -----------------------------------------------------------------

    function levenshtein(w1, w2, l1, l2, i, j, cst, diz) {
    l1 = length(w1)
    l2 = length(w2)

    # initialize distance array
    for (i = 0; i <= l1; i++) diz[i, 0] = i
    for (j = 0; j <= l2; j++) diz[0, j] = j

    # compute distance
    for (i = 1; i <= l1; i++) {
    for (j = 1; j <= l2; j++) {
    cst = (substr(w1, i, 1) == substr(w2, j, 1)) ? 0 : 1
    diz[i, j] = min3(diz[i-1, j] + 1, # deletion
    diz[i, j-1] + 1, # insertion
    diz[i-1, j-1] + cst) # substitution
    }
    }

    return diz[l1, l2]
    }

    # -----------------------------------------------------------------

    # metaphone helper function
    function isvowel(char) { return char ~ /[AEIOU]/ }

    # -----------------------------------------------------------------

    # levenshtein helper function
    function min3(a, b, c) { return (a < b ? (a < c ? a : c) : (b < c ? b : c)) }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Kenny McCormack@21:1/5 to Mike Sanders on Wed Aug 21 03:13:38 2024
    In article <va3k5u$3n2um$1@dont-email.me>,
    Mike Sanders <porkchop@invalid.foo> wrote:
    just in case...

    not sure its wise to use 'm += var' with digits:

    m += string # valid
    m += "7" # may be invalid if its a digit (even if quoted)

    In AWK, these are very different things.

    += is always arithmetic; it is not string concatenation at all.

    --
    Christianity is not a religion.

    - Rick C Hodgin -

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Kenny McCormack on Wed Aug 21 05:32:49 2024
    Kenny McCormack <gazelle@shell.xmission.com> wrote:

    In article <va3k5u$3n2um$1@dont-email.me>,
    Mike Sanders <porkchop@invalid.foo> wrote:
    just in case...

    not sure its wise to use 'm += var' with digits:

    m += string # valid
    m += "7" # may be invalid if its a digit (even if quoted)

    In AWK, these are very different things.

    += is always arithmetic; it is not string concatenation at all.

    Ahh... Many thanks as always Kenny. When our brains are uploaded
    to the great cyborg in the sky in the future, I'll be looking
    for a copy of your awk knowledge.

    See elsewhere in this group a soon to be posted question titled:

    '{} Questions'

    Wondering if what I'm thinking is good/bad logic...

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Ben Bacarisse@21:1/5 to Mike Sanders on Wed Aug 21 09:15:45 2024
    porkchop@invalid.foo (Mike Sanders) writes:

    Ben Bacarisse <ben@bsb.me.uk> wrote:

    I don't know what your are asking for as this (your latest AWK) is not
    just an implementation of the metaphone algorithm. With the extra
    Levenshtein test it "texas" matches only a few words.

    Not seeking/asking for anything Ben, just enjoy the ride =)

    As for my Metaphone take... In fact it is. Several Metaphone variants
    use Levenshtein & can be any mixture of three types of Metaphone
    versions further still, or even a mix. Seems that's the way it is
    in the wild...

    There are certainly variants (and developments of the original) but I
    thought you were implementing the same algorithm as the code you
    referenced. Sorry for the confusion.

    --
    Ben.

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to Ben Bacarisse on Wed Aug 21 19:13:59 2024
    Ben Bacarisse <ben@bsb.me.uk> wrote:

    There are certainly variants (and developments of the original) but I
    thought you were implementing the same algorithm as the code you
    referenced. Sorry for the confusion.

    No worries =) Your suspicions are not unfounded: I started
    with the same, its just well, terrible IMO, the BASIC is
    readable but the more or less the same.

    Mainly just wanted the instructions the gist link had.
    At this point I've studied PHP, javascript, python, etc.
    My AWK implementation is starting to tighten up considerably.

    Read/study from those that shared & paying it forward - its all good.

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to All on Wed Aug 21 19:03:53 2024
    Confirmed a suspicion I had: Cosine Similarity is faster
    than Levenshtein Distance but my testing is informal at best.

    Also a new option that outputs to CSV, screenshot of file
    loaded into spreadsheet here:

    https://drive.google.com/file/d/1UYDSTWzSvNoR8oKd2IqazKMQb-OcAlI1/view?usp=sharing

    DEBUG { code } coming too.

    Really exited about this project =)

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)
  • From Mike Sanders@21:1/5 to All on Fri Aug 23 06:13:17 2024
    Final iteration (for me). Please post any improvements in this group.

    # Metaphone Algorithm in AWK v6: Michael Sanders - 2024
    # usage example: echo poluphloisboiotatotic | awk -f metaphone.awk
    # valid output should be: POLUPHLOISBOIOTATOTIC : PLFLSBTTTK
    # see also: https://en.wikipedia.org/wiki/Metaphone

    { print $0 " : " metaphone($0) }

    function isvowel(char) { return char ~ /[AEIOU]/ }

    function metaphone(w, m, c, n, z, i) {

    w = toupper(w)
    # strip non-alphabetic characters
    gsub(/[^A-Z]/, "", w)
    z = length(w)

    # handle initial letters
    if (substr(w, 1, 2) ~ /^(KN|GN|PN|WR|PS)/) {
    w = substr(w, 2)
    z--
    }

    for (i = 1; i <= z; i++) {
    c = substr(w, i, 1)
    n = (i < z) ? substr(w, i + 1, 1) : ""

    # skip duplicate letters except for 'C'
    if (i > 1 && c == substr(w, i - 1, 1) && c != "C") continue

    # handle vowels: retain only if it's the 1st letter
    if (isvowel(c)) {
    if (i == 1) m = m c
    }
    # consonants...
    else if (c == "B") {
    if (!(i == z && substr(w, i - 1, 1) == "M")) m = m "B"
    }
    else if (c == "C") {
    if (substr(w, i, 2) == "CH") {
    m = m "X"
    i++
    } else if (substr(w, i, 2) ~ /^(CI|CE|CY)/) {
    m = m "S"
    } else {
    m = m "K"
    }
    }
    else if (c == "D") {
    if (substr(w, i, 2) == "DG" && isvowel(substr(w, i + 2, 1))) {
    m = m "J"
    i += 2
    } else {
    m = m "T"
    }
    }
    else if (c == "F") { # Handling for 'F'
    m = m "F"
    }
    else if (c == "G") {
    if (substr(w, i, 2) == "GH" && (i == 1 || !isvowel(substr(w, i - 1, 1)))) {
    i++
    } else if (substr(w, i, 2) == "GN" || (i == z && c == "G")) {
    continue
    } else if (substr(w, i, 3) ~ /^(GIA|GIE|GEY)/) {
    m = m "J"
    } else {
    m = m "K"
    }
    }
    else if (c == "H") {
    if (i == 1 || substr(w, i - 1, 1) !~ /[CSPTG]/) {
    if (i < z && !isvowel(n)) {
    m = m "H"
    }
    }
    }
    else if (c == "K") {
    if (i == 1 || substr(w, i - 1, 1) != "C") m = m "K"
    }
    else if (c == "P") {
    if (substr(w, i, 2) == "PH") {
    m = m "F"
    i++
    } else {
    m = m "P"
    }
    }
    else if (c == "Q") {
    m = m "K"
    }
    else if (c == "R") {
    if (i == 1 || isvowel(substr(w, i - 1, 1))) {
    m = m "R"
    }
    }
    else if (c == "S") {
    if (substr(w, i, 2) == "SH") {
    m = m "X"
    i++
    } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
    m = m "X"
    i += 2
    } else {
    m = m "S"
    }
    }
    else if (c == "T") {
    if (substr(w, i, 2) == "TH") {
    m = m "0" # add '0' for 'TH' digraph to distinguish from regular 'T'
    i++
    } else if (substr(w, i, 3) == "TIA" || substr(w, i, 3) == "TIO") {
    m = m "X"
    i += 2
    } else {
    m = m "T"
    }
    }
    else if (c == "V") {
    m = m "F"
    }
    else if (c == "W" || c == "Y") {
    if (i < z && isvowel(n)) m = m c
    }
    else if (c == "X") {
    m = m "KS"
    }
    else if (c == "Z") {
    m = m "S"
    }
    # handle M, N, L, J, G more generally
    else if (c == "M" || c == "N" || c == "L" || c == "J" || c == "G") {
    m = m c
    }
    }

    return m
    }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)