• Detecting English Rhyme Words In AWK

    From Mike Sanders@21:1/5 to All on Wed Aug 21 01:12:07 2024
    1st attempt...

    # An attempt at detecting English rhyme words in AWK - Michael Sanders, 2024
    #
    # example invocation: awk -f rhyme.awk -v rhyme=skipper < words.txt

    # Main rule: print the first field of every input record that rhymes with
    # the word given via -v rhyme=WORD and whose Levenshtein distance to that
    # word is below the limit. The limit is exclusive and defaults to 2; pass
    # -v maxdist=N on the command line to widen/constrain results.
    { if (isrhyme($1, rhyme) && levenshtein($1, rhyme) < (maxdist == "" ? 2 : maxdist + 0)) print $1 }

    # -----------------------------------------------------------------

    # Returns 1 if w1 and w2 are considered rhyming words, 0 otherwise.
    #
    #   w1, w2 - the words to compare (case-insensitive)
    #   e1, e2 - locals, do not pass
    #
    # Two words rhyme when their endings from the last vowel onwards (see
    # lvcs) are identical, or when both end in one of the alternatives of
    # any single exception pattern listed below.
    #
    # The patterns are passed as strings (dynamic regexes) on purpose: a
    # /regex/ literal used as an argument to a user-defined function is
    # evaluated as `$0 ~ /regex/`, so the callee would receive 0 or 1
    # instead of the pattern.
    function isrhyme(w1, w2, e1, e2) {
        # lowercase words for consistency
        w1 = tolower(w1)
        w2 = tolower(w2)

        # if the general rule applies (identical endings), return 1
        e1 = lvcs(w1)
        e2 = lvcs(w2)
        if (e1 == e2) return 1

        # specific exceptions for common english rhyming patterns
        if (ismatch(w1, w2, "(ight|ite)$")) return 1 # tight/lite
        if (ismatch(w1, w2, "(air|ear)$")) return 1 # hair/bear
        if (ismatch(w1, w2, "(ain|ane)$")) return 1 # plain/lane
        if (ismatch(w1, w2, "(ought|aught)$")) return 1 # bought/caught
        if (ismatch(w1, w2, "(ell|ale)$")) return 1 # sell/pale
        if (ismatch(w1, w2, "(ound|own)$")) return 1 # round/crown
        if (ismatch(w1, w2, "(en|in)$")) return 1 # pen/tin
        if (ismatch(w1, w2, "(ow|ou)$")) return 1 # cow/thou
        if (ismatch(w1, w2, "(ine|ign)$")) return 1 # wine/sign

        # common word endings that may not exactly match yet rhyme phonetically
        if (ismatch(w1, w2, "(tion|sion)$")) return 1 # nation/mission
        if (ismatch(w1, w2, "(tion|cian)$")) return 1 # station/magician
        if (ismatch(w1, w2, "(able|ible)$")) return 1 # capable/responsible

        # special cases for shorter vowel/consonant combinations
        if (ismatch(w1, w2, "(op|up)$")) return 1 # hop/cup
        if (ismatch(w1, w2, "(ap|ep)$")) return 1 # clap/step
        if (ismatch(w1, w2, "(ad|ed)$")) return 1 # bad/red
        if (ismatch(w1, w2, "(ox|ock)$")) return 1 # box/rock

        # catch-all...
        if (ismatch(w1, w2, "(ot|ote|oat)$")) return 1 # note/boat
        if (ismatch(w1, w2, "(upe|oop)$")) return 1 # dupe/poop

        return 0
    }

    # -----------------------------------------------------------------

    # Returns the Levenshtein (edit) distance between w1 and w2: the number
    # of single-character insertions, deletions and substitutions needed to
    # turn one string into the other. The comparison is case-sensitive.
    #
    #   w1, w2                  - the strings to compare
    #   l1, l2, i, j, cst, diz  - locals, do not pass
    function levenshtein(w1, w2, l1, l2, i, j, cst, diz) {
        l1 = length(w1)
        l2 = length(w2)

        # first column: cost of deleting the first i characters of w1
        i = 0
        while (i <= l1) {
            diz[i, 0] = i
            i++
        }

        # first row: cost of inserting the first j characters of w2
        j = 0
        while (j <= l2) {
            diz[0, j] = j
            j++
        }

        # fill the remaining cells row by row
        i = 1
        while (i <= l1) {
            j = 1
            while (j <= l2) {
                # substitution is free when the two characters agree
                if (substr(w1, i, 1) == substr(w2, j, 1))
                    cst = 0
                else
                    cst = 1

                # cheapest of deletion, insertion and substitution
                diz[i, j] = min3(diz[i-1, j] + 1, diz[i, j-1] + 1, diz[i-1, j-1] + cst)
                j++
            }
            i++
        }

        # bottom-right cell holds the distance between the full strings
        return diz[l1, l2]
    }

    # -----------------------------------------------------------------

    # levenshtein helper: returns the smallest of the three values a, b, c
    function min3(a, b, c) {
        if (a < b) {
            # b is out of the running; decide between a and c
            if (a < c) return a
            return c
        }
        # a is out of the running; decide between b and c
        if (b < c) return b
        return c
    }

    # -----------------------------------------------------------------

    # rhyme helper: returns the tail of w that starts at its last vowel
    # (one of a, e, i, o, u, y - lowercase only), e.g. skipp'er', educati'on';
    # returns "" when w contains no vowel
    #
    #   w - the word to inspect (callers are expected to lowercase it first)
    #   i - local, do not pass
    function lvcs(w,    i) {
        # start from end of word looking for last vowel
        for (i = length(w); i > 0; i--) {
            if (substr(w, i, 1) ~ /[aeiouy]/) return substr(w, i) # last vowel to end
        }
        return "" # return empty string if no vowel is found
    }

    # -----------------------------------------------------------------

    # rhyme helper: returns 1 when both w1 and w2 match the regular
    # expression p (for example '(ight|ite)$'), 0 otherwise; w2 is only
    # tested when w1 already matched
    function ismatch(w1, w2, p) {
        if (!match(w1, p)) return 0
        return match(w2, p) ? 1 : 0
    }

    # eof

    --
    :wq
    Mike Sanders

    --- SoupGate-Win32 v1.05
    * Origin: fsxNet Usenet Gateway (21:1/5)