• Re: Tokenizer (Re: Most efficient way to read words from string.)

    From B. Pym@Nobody447095@here-nor-there.org to comp.lang.lisp,comp.lang.scheme on Tue Aug 26 09:04:18 2025
    From Newsgroup: comp.lang.lisp

    (defun string-split (str &optional (separator #\Space))
    "Splits the string STR at each SEPARATOR character occurrence.
    The resulting substrings are collected into a list which is returned.
    A SEPARATOR at the beginning or at the end of the string STR results
    in an empty string in the first or last position of the list
    returned."
    (declare (type string str)
    (type character separator))
    (loop for start = 0 then (1+ end)
    for end = (position separator str :start 0)
    then (position separator str :start start)
    for substr = (subseq str start end)
    then (subseq str start end)
    collect substr into result
    when (null end) do (return result)
    ))


    Testing:

    * (string-split " foo bar ")

    ("" "foo" "" "" "" "" "" "bar" "")


    Gauche Scheme

    "!" is similar to "do".

    (define (tokenize str separators)
    (let ((seps (string->list separators)))
    (! (ch :in (reverse (cons (car seps) (string->list str)))
    := sep (member ch seps)
    r cons (list->string tmp) :if (and (pair? tmp) sep)
    tmp '() (if sep '() (cons ch tmp)))
    #f r)))

    (tokenize " foo; bar, baz, and ... zap" " ,;.")
    ===>
    ("foo" "bar" "baz" "and" "zap")

    Given:

    (define-syntax !-aux
    (syntax-rules (<> @ @@ + - cons cdr :in :across :along
    :if :if-else !
    :let := )
    [(_ (:let id val z ...) seen (lets ...) stuff ...)
    (!-aux (z ...) seen (lets ... (id val)) stuff ...) ]
    [(_ (:= id val z ...) stuff ...)
    (!-aux (:let id #f dummy #f (set! id val) z ...) stuff ...) ]
    [(_ (:if bool z ...) (seen ... (v i u)) stuff ...)
    (!-aux (z ...)
    (seen ... (v i (if bool u v))) stuff ...) ]
    ;;
    [(_ (:if-else bool z ...) (seen ... (a b c)(d e f)) stuff ...)
    (!-aux (:let yes #f z ...)
    (seen ... (a b (begin (set! yes bool) (if yes c a)))
    (d e (if (not yes) f d))) stuff ...) ]
    ;;
    [(_ (x :in lst z ...) seen lets bool stuff ...)
    (!-aux (:let xs lst x (and (pair? xs)(pop! xs)) <> z ...)
    seen lets (or (not x) bool) stuff ...) ]
    [(_ (x :across vec z ...) seen lets bool stuff ...)
    (!-aux (:let v vec :let i 0
    x (and (< i (vector-length v))
    (begin0 (vector-ref v i) (inc! i))) <>
    z ...)
    seen lets (or (not x) bool) stuff ...) ]
    [(_ (ch :along str z ...) seen lets bool stuff ...)
    (!-aux (:let s str :let i 0
    ch (and (< i (string-length s))
    (begin0 (string-ref s i) (inc! i))) <>
    z ...)
    seen lets (or (not ch) bool) stuff ...) ]
    [(_ (a b <> z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a b b)) stuff ...) ]
    [(_ (a b + z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a b (+ 1 a))) stuff ...) ]
    [(_ (a + n z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a 0 (+ n a))) stuff ...) ]
    [(_ (a b - z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a b (- a 1))) stuff ...) ]
    [(_ (a cons b z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a '() (cons b a))) stuff ...) ]
    [(_ (a b cdr z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a b (cdr a))) stuff ...) ]
    [(_ (a b c z ...) (seen ...) stuff ...)
    (!-aux (z ...) (seen ... (a b c)) stuff ...) ]
    [(_ (a b) (seen ...) stuff ...)
    (!-aux () (seen ... (a b)) stuff ...) ]
    [(_ (a) (seen ...) stuff ...)
    (!-aux () (seen ... (a '())) stuff ...) ]
    ;;
    [(_ () seen lets bool ! action ...)
    (!-aux () seen lets bool #t (action ...)) ]
    ;;
    [(_ () ((a b c) z ...) lets bool)
    (!-aux () ((a b c) z ...) lets bool a) ]
    [(_ () ((a b c) z ...) lets bool @)
    (!-aux () ((a b c) z ...) lets bool (reverse a)) ]
    [(_ () seen lets bool @ result stuff ...)
    (!-aux () seen lets bool (reverse result) stuff ...) ]
    ;;
    [(_ () seen lets bool @@ (what x ...) stuff ...)
    (!-aux () seen lets bool (what (reverse x) ...) stuff ...) ]
    [(_ () seen lets bool (what @ x z ...) stuff ...)
    (!-aux () seen lets bool (what (reverse x) z ...) stuff ...) ]
    [(_ () seen lets bool (what x @ y z ...) stuff ...)
    (!-aux () seen lets bool (what x (reverse y) z ...) stuff ...) ]
    ;;
    [(_ () ((a b c) z ...) lets 0 stuff ...)
    (!-aux () ((a b c) z ...) lets (= 0 a) stuff ...) ]
    [(_ () seen lets bool result stuff ...)
    (let lets (do seen (bool result) stuff ...)) ]
    ))
    (define-syntax !
    (syntax-rules ()
    [(_ specs bool stuff ...)
    (!-aux specs () () bool stuff ...) ]
    [(_ specs) (! specs #f) ]
    ))
    --
    [T]he problem is that lispniks are as cultish as any other devout group and basically fall down frothing at the mouth if they see [heterodoxy].
    --- Kenny Tilton
    The good news is, it's not Lisp that sucks, but Common Lisp. --- Paul Graham --- Synchronet 3.21a-Linux NewsLink 1.2
  • From tpeplt@tpeplt@gmail.com to comp.lang.lisp,comp.lang.scheme on Tue Aug 26 15:02:11 2025
    From Newsgroup: comp.lang.lisp


    (defun string-split (str &optional (separator #\Space))
    "Splits the string STR at each SEPARATOR character occurrence.
    The resulting substrings are collected into a list which is returned.
    A SEPARATOR at the beginning or at the end of the string STR results
    in an empty string in the first or last position of the list
    returned."
    (declare (type string str)
    (type character separator))
    (loop for start = 0 then (1+ end)
    for end = (position separator str :start 0)
    then (position separator str :start start)
    for substr = (subseq str start end)
    then (subseq str start end)
    collect substr into result
    when (null end) do (return result)
    ))


    1. With a for-as-equals-then clause, if the rCythenrCO FORM2 is
    omitted, then the FORM1 is evaluated for each iteration.
    So, if FORM2 is identical to FORM1, it may be omitted.

    See: http://www.ai.mit.edu/projects/iiip/doc/CommonLISP/HyperSpec/Body/sec_6-1-2-1-4.html

    2. rCypositionrCO returns NIL if the item is not found in the
    sequence, so it is necessary to check the result of a call
    to this function before using it with arithmetic functions.

    Here is an alternative version of the function
    rCystring-splitrCO:

    (defun string-split (str &optional (separator #\Space))
    "Splits the string STR at each SEPARATOR character occurrence.
    The resulting substrings are collected into a list which is returned.
    A SEPARATOR at the beginning or at the end of the string STR results
    in an empty string in the first or last position of the list
    returned."
    (declare (type string str)
    (type character separator))
    (loop
    with end = 0
    for start = (position separator str
    :start end :test-not #'equal)
    if start
    do (setf end (or (position separator str :start start)
    (length str)))
    and collect (subseq str start end)
    until (null start)))

    Some tests:

    (string-split " ")
    NIL

    (string-split "stringtolist")
    ("stringtolist")

    (string-split "stringtolist ")
    ("stringtolist")

    (string-split " stringtolist")
    ("stringtolist")

    (string-split " stringtolist ")
    ("stringtolist")

    (string-split " string to list ")
    ("string" "to" "list")

    (string-split " string to list")
    ("string" "to" "list")

    (string-split "string to list ")
    ("string" "to" "list")


    Gauche Scheme

    "!" is similar to "do".

    (define (tokenize str separators)
    (let ((seps (string->list separators)))
    (! (ch :in (reverse (cons (car seps) (string->list str)))
    := sep (member ch seps)
    r cons (list->string tmp) :if (and (pair? tmp) sep)
    tmp '() (if sep '() (cons ch tmp)))
    #f r)))

    (tokenize " foo; bar, baz, and ... zap" " ,;.")
    ===>
    ("foo" "bar" "baz" "and" "zap")


    3. Here is a generalized version of rCystring-splitrCO,
    supporting a string of separators as an argument and using
    Common LisprCOs rCyposition-ifrCO/rCyposition-if-notrCO functions in
    place of rCypositionrCO:

    ```
    (defun string-split (str &optional (separators " ;,."))
    "Splits the string STR at each SEPARATOR character occurrence.
    The resulting substrings are collected into a list which is returned.
    A SEPARATOR at the beginning or at the end of the string STR results
    in an empty string in the first or last position of the list
    returned."
    (declare (type string str)
    (type string separators))
    (loop
    with bag = (coerce separators 'list) and end = 0
    for start = (position-if-not (lambda (ch) (member ch bag)) str
    :start end)
    if start
    do (setf end (or (position-if (lambda (ch) (member ch bag)) str
    :start start)
    (length str)))
    and collect (subseq str start end)
    until (null start)))
    ```

    Some tests:

    (string-split " " "- ")
    NIL

    (string-split " " "-;")
    (" ")

    (string-split "string-to-list" "- ")
    ("string" "to" "list")

    (string-split "string-to-list " "- ")
    ("string" "to" "list")

    (string-split " string-to-list" "- ")
    ("string" "to" "list")

    (string-split " string-to-list " "- ")
    ("string" "to" "list")

    (string-split " string-to-list " "- ")
    ("string" "to" "list")

    (string-split " string-to-list" "- ")
    ("string" "to" "list")

    (string-split "string-to-list " "- ")
    ("string" "to" "list")
    --
    The lyf so short, the craft so long to lerne.
    - Geoffrey Chaucer, The Parliament of Birds.
    --- Synchronet 3.21a-Linux NewsLink 1.2