forked from ruricolist/serapeum
-
Notifications
You must be signed in to change notification settings - Fork 0
/
strings.lisp
961 lines (852 loc) · 37.9 KB
/
strings.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
(in-package :serapeum)
(deftype string-offset ()
'(integer #.(- array-dimension-limit) #.array-dimension-limit))
(defconst no-break-space
#-(or abcl lispworks) #\No-break_space
#+(or abcl lispworks) (code-char 160))
(defconst whitespace
#.(remove-duplicates
(coerce (list #\Space #\Tab #\Linefeed #\Return #\Newline #\Page
#\Vt ;Vertical tab.
no-break-space)
'string))
"Whitespace characters.")
(defsubst whitespacep (char)
"Is CHAR whitespace?
Spaces, tabs, any kind of line break, page breaks, and no-break spaces
are considered whitespace."
(case (char-code char)
(#.(map 'list #'char-code whitespace) t)))
(defsubst trim-whitespace (string)
"STRING without whitespace at ends."
(string-trim whitespace string))
(defsubst ascii-char-p (char)
"Is CHAR an ASCII char?"
(and (< (char-code char) 128)
char))
(defun call/string (fn stream)
"Resolve STREAM like `format' and call FN with the result."
(fbind (fn)
(etypecase stream
((eql t) (fn *standard-output*))
((eql nil)
(with-output-to-string (s)
(fn s)))
((and string (not simple-string))
(with-output-to-string (s stream)
(fn s)))
(output-stream (fn stream)))))
(defmacro with-string ((var &optional stream) &body body)
"Bind VAR to the character stream designated by STREAM.
STREAM is resolved like the DESTINATION argument to `format': it can
be any of t (for `*standard-output*'), nil (for a string stream), a
string with a fill pointer, or a stream to be used directly.
When possible, it is a good idea for functions that build strings to
take a stream to write to, so callers can avoid consing a string just
to write it to a stream. This macro makes it easy to write such
functions.
(defun format-x (x &key stream)
(with-string (s stream)
...))"
(when (constantp stream)
(let ((stream (eval stream)))
(cond ((eql stream t)
(return-from with-string
`(let ((,var *standard-output*))
,@body)))
((null stream)
(return-from with-string
`(with-output-to-string (,var)
,@body))))))
(with-thunk (body var)
`(call/string #',body ,stream)))
(defsubst blankp (seq)
"SEQ is either empty, or consists entirely of characters that
satisfy `whitespacep'."
(every #'whitespacep seq))
(defun collapse-whitespace (string &key (space #\Space) stream)
"Collapse runs of whitespace in STRING.
Each run of space, newline, and other whitespace characters is
replaced by a single space character (or SPACE, if that is specified)."
(declare (inline position whitespacep))
(check-type string string)
(check-type space character)
(with-string (s stream)
(with-string-dispatch () string
(let ((len (length string)))
(nlet rec ((i 0))
(unless (= i len)
(let ((char (vref string i)))
(cond ((whitespacep char)
(write-char space s)
(rec (or (position-if-not #'whitespacep string
:start i)
len)))
(t
(let ((j (or (position-if #'whitespacep string
:start i)
len)))
(write-string string s :start i :end j)
(rec j)))))))))))
(-> simplify-string (string) (simple-array character (*)))
(defun simplify-string (string)
(declare (string string))
(coerce string '(simple-array character (*))))
(-> copy-string (string-designator) string)
(defun copy-string (x)
(copy-seq (string x)))
(defsubst concat (&rest strings)
"Abbreviation for (concatenate 'string ...).
From Emacs Lisp."
(apply #'concatenate 'string strings))
(defun mapconcat/list (fun list sep stream)
(declare (list list) (function fun) (string sep) (optimize speed))
(loop for (elt . more?) on list
do (write-string (funcall fun elt) stream)
if more?
do (write-string sep stream)))
(defun mapconcat/seq (fun seq sep stream)
(declare (function fun) (string sep))
(if (emptyp seq)
(make-string 0)
(let ((i 0)
(ult (1- (length seq))))
(declare (type array-index i ult))
(do-each (elt seq)
(write-string (funcall fun elt) stream)
(unless (= (prog1 i (incf i)) ult)
(write-string sep stream))))))
(defun mapconcat (fun seq separator &key stream)
"Build a string by mapping FUN over SEQ.
Separate each value with SEPARATOR.
Equivalent to
(reduce #'concat (intersperse SEP SEQ) :key FUN)
but more efficient.
STREAM can be used to specify a stream to write to. It is resolved
like the first argument to `format'.
From Emacs Lisp."
(values
(if (emptyp seq)
(make-string 0)
(let ((fun (ensure-function fun)))
(check-type separator string)
(with-string (stream stream)
(seq-dispatch seq
(mapconcat/list fun seq separator stream)
(mapconcat/seq fun seq separator stream)))))))
(defun string-join (strings &optional (separator ""))
"Like `(mapconcat #'string STRINGS (string SEPARATOR))'."
(mapconcat #'string strings (string separator)))
(-> string-upcase-initials (string-designator) string)
(defun string-upcase-initials (string)
"Return STRING with the first letter of each word capitalized.
This differs from STRING-CAPITALIZE in that the other characters in
each word are not changed.
(string-capitalize \"an ACRONYM\") -> \"An Acronym\")
(string-upcase-initials \"an ACRONYM\") -> \"An ACRONYM\")
From Emacs Lisp (where it is simply `upcase-initials')."
(nstring-upcase-initials (copy-string string)))
(-> nstring-upcase-initials (string-designator) string)
(defun nstring-upcase-initials (string)
"Destructive version of `string-upcase-initials'."
(lret ((string (string string)))
(with-string-dispatch () string
(when (= (length string) 0)
(return-from nstring-upcase-initials
string))
(setf (vref string 0)
(char-upcase (vref string 0)))
(loop for i from 0
for j from 1 below (length string)
for x = (vref string i)
for y = (vref string j)
when (and (not (alphanumericp x))
(alphanumericp y))
do (setf (vref string j)
(char-upcase y))))))
;;; https://groups.google.com/d/msg/comp.lang.lisp/EO1mZBtiXX0/JuuhKJ6eMHIJ
;;; https://groups.google.com/d/msg/comp.lang.lisp/0CSkbAd8NTg/UnHQf9YIf8kJ
(defun same-case-p (string)
"Every character with case in STRING has the same case.
Return `:upper' or `:lower' as appropriate."
(let ((string (string string)))
(with-string-dispatch () string
(let ((length (length string)))
(declare (array-length length))
(nlet invert ((i 0)
(ucp nil)
(lcp nil))
(declare (array-length i))
(if (= i length)
(cond ((eq ucp lcp) nil)
(ucp :upper)
(lcp :lower))
(let ((char (vref string i)))
(cond ((upper-case-p char)
(invert (1+ i) t lcp))
((lower-case-p char)
(invert (1+ i) ucp t))
(t (invert (1+ i) ucp lcp))))))))))
(-> nstring-invert-case (string-designator) string)
(defun nstring-invert-case (string)
"Destructive version of `string-invert-case'."
(let ((string (string string)))
(case (same-case-p string)
(:upper (nstring-downcase string))
(:lower (nstring-upcase string))
(t string))))
(-> string-invert-case (string-designator) string)
(defun string-invert-case (string)
"Invert the case of STRING.
This does the same thing as a case-inverting readtable:
- If the string is uppercase, downcase the string.
- If the string is lowercase, upcase the string.
- If the string is mixed-case, leave it alone."
(nstring-invert-case (copy-string string)))
(-> words (string &key (:start array-index) (:end (or array-index null)))
list)
(defun words (string &key (start 0) end)
"Split STRING into words.
The definition of a word is the same as that used by
`string-capitalize': a run of alphanumeric characters.
(words \"Four score and seven years\")
=> (\"Four\" \"score\" \"and\" \"seven\" \"years\")
(words \"2 words\")
=> (\"2\" \"words\")
(words \"two_words\")
=> (\"two\" \"words\")
(words \"\\\"I'm here,\\\" Tom said presently.\")
=> (\"I\" \"m\" \"here\" \"Tom\" \"said\" \"presently\")
Cf. `tokens'."
(declare (string string)
(optimize speed))
(values (split-sequence-if-not #'alphanumericp
string
:remove-empty-subseqs t
:start start
:end end)))
(-> tokens (string &key (:start array-index) (:end (or array-index null)))
list)
(defun tokens (string &key (start 0) end)
"Separate STRING into tokens.
Tokens are runs of non-whitespace characters.
(tokens \"\\\"I'm here,\\\" Tom said presently.\")
=> (\"\\\"I'm\" \"here,\\\"\" \"Tom\" \"said\" \"presently.\")
Cf. `words'."
(declare (string string))
(values (split-sequence-if #'whitespacep
string
:remove-empty-subseqs t
:start start
:end end)))
(defun word-wrap (string &key (column 80) stream)
"Return a word-wrapped version of STRING that breaks at COLUMN.
Note that this is not a general-purpose word-wrapping routine like you
would find in a text editor: in particular, any existing whitespace is
removed."
(let ((col 0))
(with-string (s stream)
(loop for (token . more) on (tokens string) do
(flet ((reset ()
(setq col 0)
(terpri s))
(output-word (word)
(write-string word s)
(incf col (length word))
(when more
(write-char #\Space s)
(incf col))))
(let ((projected-length (+ col (length token))))
(if (<= projected-length column)
(output-word token)
(progn
(reset)
(output-word token)))))))))
(-> lines ((or null string)
&key
(:eol-style (or (member nil :cr :lf :crlf :ascii :unicode)
function))
(:honor-crlf t)
(:keep-eols t)
(:count (or null (integer 0 *))))
(values (or null list) &optional))
(defun lines (string &key eol-style (honor-crlf nil honor-crlf-p) keep-eols count)
"Return a list of the lines in STRING, stripped of any EOL characters
and including the last nonempty line even if it has no EOL characters,
or NIL if STRING is empty or NIL.
If COUNT is provided, only the first COUNT lines are returned.
EOL-STYLE can be one of the following:
- NIL, the default, which means split on #\\Newline.
- :CR, which means split on CR, i.e., #\\Return.
- :LF, which means split on LF, i.e., #\\Linefeed.
- :CRLF, which means split on CRLF, i.e., #\\Return followed by
#\\Linefeed.
- :ASCII, which means split on any of CR, LF, and CRLF.
- :UNICODE, which means split on any of the newlines described in
Section 5.8, \"Newline Guidelines\", of the Unicode Standard,
available at http://www.unicode.org/versions/latest/.
These newlines are CR, LF, CRLF, next line, vertical tab, form feed,
line separator, and paragraph separator.
- A predicate that accepts one CHARACTER and returns non-NIL if the
CHARACTER should be split on, NIL otherwise.
:CR, :LF, :CRLF, and :ASCII assume that the Common Lisp implementation
represents CHARACTERs internally as ASCII or one of its supersets
\(e.g., extended ASCII), and :UNICODE assumes that it represents them
internally as Unicode \(which is also a superset of ASCII).
Additionally, all of the EOL-STYLEs just mentioned assume that #\\Newline
is either #\\Return or #\\Linefeed \(which can be reasonably expected).
If HONOR-CRLF is supplied, it overrides EOL-STYLE's interpretation of
CRLF except if EOL-STYLE is NIL or :CRLF, in which case HONOR-CRLF has
no effect.
\(The :CRLF, :ASCII and :UNICODE EOL-STYLEs honor CRLF by default; the
rest do not.)
If KEEP-EOLS is non-NIL, LINES does not strip the EOL characters from
the lines.
Note that Common Lisp implementations may convert some or all of CR, LF,
and CRLF to #\\Newline when reading from file streams, which causes
LINES to split the contents of files differently across implementations.
:CR, :LF, and :CRLF are suitable only when STRING's lines certainly end
with the corresponding EOL character, but if STRING originates from a
file stream, LINES splits nothing unless the corresponding EOL character
is the same as #\\Newline, in which case LINES behaves as if EOL-STYLE
were NIL \(and indeed NIL is preferable to :CR, :LF, and :CRLF, though
not to :ASCII and :UNICODE).
:UNICODE and :ASCII are the preferred EOL-STYLEs, the former to be
maximally portable and correct, and the latter when Unicode is inapt.
With either EOL-STYLE, LINES splits the entire contents of files
correctly only when the Common Lisp implementation converts only CR,
only LF, or all of CR, LF, and CRLF, to #\\Newline \(and when it
converts only CR or only LF, #\\Newline must the same as the EOL
character in question).
Again with either EOL-STYLE, LINES splits the lines of files, read with
READ-LINE, correctly only when the implementation converts only LF or
all of CR, LF, and CRLF to #\\Newline \(which must be #\\Linefeed).
\(Note the lack of the only-CR case when reading files line by line.)
However, any incorrect behavior with :ASCII and :UNICODE is limited to
LINES returning too many or too few empty lines.
The former -- which is uncorrectable -- can occur when CR and LF are
converted, but not CRLF, and the latter -- which can be corrected by
supplying HONOR-CRLF as NIL -- when CR and CRLF are converted \(to
#\\Return), but not LF, or when LF and CRLF are converted \(to
#\\Linefeed), but not CR.
For example, to split lines on LF and CRLF \(eschewing the recommended
:ASCII and :UNICODE) when the Common Lisp implementation converts only
LF to #\\Newline \(which must be #\\Linefeed), which is the same
behavior as Rust's std::io::BufRead.lines
\(https://doc.rust-lang.org/std/io/trait.BufRead.html#method.lines) and
Go's bufio.ScanLines \(https://golang.org/pkg/bufio/#ScanLines):
#.(ecase #\\Newline (#\\Linefeed))
(let ((string (coerce '(#\\a #\\Return
#\\b #\\Linefeed
#\\c #\\Return #\\Linefeed
#\\d)
'string)))
(serapeum:lines string :eol-style :lf :honor-crlf t))
=> (\"a^Mb\" \"c\" \"d\")
;; where ^M is #\\Return.
\(EOL-STYLE cannot be NIL here because otherwise HONOR-CRLF would have
no effect.)
To split lines in the same way as Python's str.splitlines
\(https://docs.python.org/3/library/stdtypes.html#str.splitlines) when
the Common Lisp implementation converts only CR, only LF, or all of CR,
LF, and CRLF, to #\\Newline \(as previously described), but also keeping
the EOL characters in order to know what they were:
#.(ecase #\\Newline ((#\\Return #\\Linefeed)))
;; Omit file separator from the example because its textual
;; representation (^\\) can confuse documentation browsers.
(let ((string (coerce '(#\\a #.(code-char #x001D)
#\\b #.(code-char #x001E)
#\\c)
'string)))
(serapeum:lines
string
:eol-style (lambda (c)
(serapeum:in
c #\\Return #\\Linefeed
#.(code-char #x000B) ; #\\Vt (vertical tab)
#\\Page ; Form feed
#.(code-char #x001C) ; #\\Fs (file separator)
#.(code-char #x001D) ; #\\Gs (group separator)
#.(code-char #x001E) ; #\\Rs (record separator)
#.(code-char #x0085) ; Next line
#.(code-char #x2028) ; #\\Line_Separator
#.(code-char #x2029))) ; #\\Paragraph_Separator
:honor-crlf t
:keep-eols t))
=> (\"a^]\" \"b^^\" \"c\")
;; where ^] is group separator and ^^ is record separator.
To omit empty lines \(thus uniformizing LINES's behavior across Common
Lisp implementations):
#.(ecase #\\Newline ((#\\Return #\\Linefeed)))
(let ((string (coerce '(#\\a #\\b #\\c
#\\Return #\\Return #\\Linefeed #\\Linefeed
#\\z)
'string)))
(delete-if #'uiop:emptyp (serapeum:lines string :eol-style :unicode)))
=> (\"abc\" \"z\")
To additionally omit lines consisting only of whitespace:
#.(ecase #\\Newline ((#\\Return #\\Linefeed)))
(let ((string (coerce '(#\\a #\\b #\\c
#\\Return #\\Return #\\Linefeed #\\Linefeed
#\\Space #\\Linefeed
#\\Tab #\\Linefeed
#\\z)
'string)))
(delete-if #'uiop:emptyp
(mapcar #'serapeum:trim-whitespace
(serapeum:lines string :eol-style :unicode))))
=> (\"abc\" \"z\")"
(let* ((honor-crlf (cond ((not eol-style) nil)
((eql eol-style :crlf))
(honor-crlf-p honor-crlf)
;; If HONOR-CRLF was supplied, it takes
;; precedence over these.
((in eol-style :ascii :unicode))))
;; To honor CRLF, we must search for #\Return, but that does
;; not necessarily mean that we should split on it.
(ignore-cr (in eol-style :lf :crlf))
(cr-p (lambda (c) (eql c #\Return)))
(cr-or-lf-p (lambda (c) (in c #\Return #\Linefeed)))
(eolp (etypecase eol-style
((or null keyword)
(ecase eol-style
((nil) (lambda (c) (eql c #\Newline)))
((:cr :crlf) cr-p)
(:lf (if honor-crlf
cr-or-lf-p
(lambda (c) (eql c #\Linefeed))))
(:ascii cr-or-lf-p)
(:unicode
(lambda (c)
(in c #\Return #\Linefeed
#.(code-char #x0085) ; Next line
#.(code-char #x000B) ; #\Vt (vertical tab)
#\Page ; Form feed
#.(code-char #x2028) ; #\Line_Separator
#.(code-char #x2029)))))) ; #\Paragraph_Separator
(function
(if (and honor-crlf
;; Do not split on #\Return if EOL-STYLE does
;; not already include it.
(setf ignore-cr (not (funcall eol-style #\Return))))
(disjoin eol-style cr-p)
eol-style)))))
(flet ((next-eol (start) (position-if eolp string :start start)))
(do* ((length (length string))
(line nil (subseq string start
(if keep-eols (1+ end) (- end crlf-offset))))
(lines nil (push line lines))
(line-count 0 (1+ line-count))
(start 0 (1+ end))
(end (next-eol start) (next-eol start))
(crlf-offset 0 0))
((or (not end)
(eql line-count count))
(if (eql line-count count)
(nreverse lines)
(nreverse (if (emptyp (setf line (subseq string start)))
lines
(cons line lines)))))
again
(when (and (eql (char string end) #\Return)
honor-crlf)
(let ((end+1 (1+ end)))
(if (< end+1 length)
(if (eql (char string end+1) #\Linefeed)
(setf end end+1
crlf-offset 1)
(when ignore-cr
(if (setf end (next-eol end+1))
(go again)
(return (nreverse (push (subseq string start)
lines))))))
(when ignore-cr
(return (nreverse (push (subseq string start)
lines)))))))))))
(-> fmt ((or string function) &rest t) string)
(defun fmt (control-string &rest args)
"A cousin of `format` expressly for fast formatting of strings.
Like (format nil ...), binding `*print-pretty*' to `nil', which in
some Lisps means a significant increase in speed.
Has a compiler macro with `formatter'."
(declare (dynamic-extent args))
(let ((*print-pretty* nil))
(the string (format nil "~?" control-string args))))
(define-compiler-macro fmt (control-string &rest args)
;; NB We want to expand into a call to `format' whenever possible,
;; so Lisp can check the number of arguments.
`(the string
,(if (stringp control-string)
;; Optimize some trivial control strings. You wouldn't
;; necessarily write these, but it's common for a complex
;; control string to "erode" over time into something
;; trivial.
(cond
;; No directives.
((not (find #\~ control-string))
`(copy-seq ,control-string))
;; Same as `princ'.
((member control-string '("~a" "~d" "~f" "~g")
:test #'equalp)
(destructuring-bind (arg) args
`(let (*print-pretty*)
(princ-to-string ,arg))))
;; Same as `prin1'.
((equalp control-string "~s")
(destructuring-bind (arg) args
`(let (*print-pretty*)
(prin1-to-string ,arg))))
(t
`(let (*print-pretty*)
(format nil (formatter ,control-string) ,@args))))
`(let (*print-pretty*)
(format nil ,control-string ,@args)))))
(defun escape/no-arg-parsing (string table start end stream)
(declare (string string)
((or function hash-table) table)
((or array-index null) start)
((or array-length null) end)
(optimize (debug 0) (safety 1)
(compilation-speed 0)
(space 0))
;; Suppress unreachable code warnings.
#+sbcl (sb-ext:muffle-conditions sb-ext:code-deletion-note))
(let ((start (or start 0))
(end (or end (length string))))
(declare (array-index start)
(array-length end))
(with-string (stream stream)
(with-type-dispatch (function hash-table) table
(with-string-dispatch () string
(flet ((rep (c)
(etypecase table
(function (funcall table c))
(hash-table (gethash c table)))))
(declare (inline rep))
(nlet escape ((start start))
(when (< start end)
(let ((next (position-if #'rep string
:start start
:end end)))
(if (not next)
(write-string string stream :start start :end end)
(progn
(write-string string stream :start start :end next)
(let ((escape (rep (vref string next))))
(unless (emptyp escape)
(write-string escape stream))
(escape (1+ next))))))))))))))
(defun escape (string table &key (start 0) end stream)
"Write STRING to STREAM, escaping with TABLE.
TABLE should be either a hash table, with characters for keys and
strings for values, or a function that takes a character and
returns (only) either a string or null.
That is, the signature of TABLE should be:
(function (character) (or string null))
where `nil' means to pass the character through unchanged.
STREAM can be used to specify a stream to write to, like the first
argument to `format'. The default behavior, with no stream specified,
is to return a string."
(escape/no-arg-parsing string table
start
end
stream))
(define-compiler-macro escape (string table &key (start 0) end stream)
`(escape/no-arg-parsing ,string
,table
,start
,end
,stream))
(-> ellipsize (string array-length &key (:ellipsis string)) string)
(defun ellipsize (string n &key (ellipsis "..."))
"If STRING is longer than N, truncate it and append ELLIPSIS.
Note that the resulting string is longer than N by the length of
ELLIPSIS, so if N is very small the string may come out longer than it
started.
(ellipsize \"abc\" 2)
=> \"ab...\"
From Arc."
(if (> (length string) n)
(concat (string-right-trim whitespace (subseq string 0 n))
ellipsis)
string))
;;; TODO These are frequently used; it might be worth rewriting them
;;; to use `with-string-dispatch' instead of mismatch.
(macrolet ((defcmp ((name1 name2) (s1 s2 &rest keys) &body body)
(let ((docstring (if (stringp (car body)) (pop body))))
(flet ((mkdef (name &key docstring)
`(defsubst ,name (,s1 ,s2 &key (start1 0) end1 (start2 0) end2 ,@keys)
,@(unsplice docstring)
(declare (array-length start1 start2))
(declare (inline mismatch))
(let* ((,s1 (string ,s1))
(,s2 (string ,s2))
(end1 (or end1 (length ,s1)))
(end2 (or end2 (length ,s2))))
(declare (array-length end1 end2))
(declare (string ,s1 ,s2))
(macrolet ((call (fun &rest args)
`(,fun ,@args
:start1 start1 :start2 start2
:end1 end1 :end2 end2
:test test)))
,@body)))))
`(progn
,(subst '#'char= 'test (mkdef name1 :docstring docstring))
,(subst '#'char-equal 'test
(let ((docstring (format nil "Like `~(~a~)', but case-insensitive." name1)))
(mkdef name2 :docstring docstring))))))))
(defcmp (string^= string-prefix-p) (prefix string)
"Is PREFIX a prefix of STRING?"
(let ((ms (call mismatch prefix string)))
(or (not ms) (= ms end1))))
;; Optimization: the prefix is a single character.
(macrolet ((dcm (name test)
`(define-compiler-macro ,name (&whole call prefix string &rest args)
(if args call
(if (and (stringp prefix)
(= (length prefix) 1))
`((lambda (s)
(and (plusp (length s))
(,',test ,(character prefix)
(aref s 0))))
(string ,string))
call)))))
(dcm string^= char=)
(dcm string-prefix-p char-equal))
(defcmp (string$= string-suffix-p) (suffix string)
"Is SUFFIX a suffix of STRING?"
(let ((ms (call mismatch suffix string :from-end t)))
(or (not ms) (= ms start1))))
;; Optimization: the suffix is a single character.
(macrolet ((dcm (name test)
`(define-compiler-macro ,name (&whole call suffix string &rest args)
(if args call
(if (and (stringp suffix)
(= (length suffix) 1))
`((lambda (s)
(and (plusp (length s))
(,',test ,(character suffix)
(aref s (1- (length s))))))
(string ,string))
call)))))
(dcm string$= char=)
(dcm string-suffix-p char-equal))
(defcmp (string*= string-contains-p) (substring string)
"Is SUBSTRING a substring of STRING?
This is similar, but not identical, to SEARCH.
(search nil \"foo\") => 0
(search \"nil\" \"nil\") => 0
(string*= nil \"foo\") => NIL
(string*= nil \"nil\") => T"
(call search substring string))
;; Optimization: the substring is a single character.
(macrolet ((dcm (name test)
`(define-compiler-macro ,name (&whole call substring string &rest args)
(if args call
(if (and (stringp substring)
(= (length substring) 1))
`(position ,(character substring) (string ,string) :test #',',test)
call)))))
(dcm string*= char=)
(dcm string-contains-p char-equal))
(defcmp (string~= string-token-p) (token string)
"Does TOKEN occur in STRING as a token?
Equivalent to
(find TOKEN (tokens STRING) :test #'string=),
but without consing."
;; Adapted from split-sequence.
(let ((len (length string))
(end end2))
(declare (array-length len end))
(macrolet ((compare-segment (left right)
`(not (mismatch token string :start1 start1 :end1 end1
:start2 ,left :end2 ,right
:test test))))
(loop for left of-type array-length
= start2
then (+ right 1)
for right of-type array-length
= (min (or (position-if #'whitespacep string :start left) len)
end)
thereis (and (not (= right left))
(compare-segment left right))
until (>= right end))))))
(defun string-replace-all (old string new &key (start 0) end stream count)
"Do search-and-replace for constant strings.
Note that START and END only affect where the replacements are made:
the part of the string before START, and the part after END, are
always included verbatim.
(string-replace-all \"old\" \"The old old way\" \"new\"
:start 3 :end 6)
=> \"The new old way\"
COUNT can be used to limit the maximum number of occurrences to
replace. If COUNT is not specified, every occurrence of OLD between
START and END is replaced with NEW.
(string-replace-all \"foo\" \"foo foo foo\" \"quux\")
=> \"quux quux quux\"
(string-replace-all \"foo\" \"foo foo foo\" \"quux\" :count 2)
=> \"quux quux foo\"
STREAM can be used to specify a stream to write to. It is resolved
like the first argument to `format'."
(declare ((or null array-index) start)
((or array-length null) end)
;; Can't be more matches than characters.
((or array-length null) count))
(check-type old string)
(check-type new string)
(check-type string string)
(let ((start (or start 0))
(new (simplify-string new))
(old (simplify-string old)))
(declare (array-index start))
(with-string (s stream)
(with-string-dispatch () string
(let* ((end (or end (length string)))
(len (length old)))
(declare (array-length len))
(unless (zerop start)
(write-string string s :start 0 :end start))
(nlet rep ((start start)
(count (or count (1- array-dimension-limit))))
(declare (array-length start count))
(let ((match (search old string :start2 start :end2 end)))
(declare ((or array-length null) match))
(if (or (not match) (zerop count))
;; No end, because we want the whole remainder of the
;; string.
(write-string string s :start start)
(progn
(write-string string s :start start :end match)
(write-string new s)
(rep (+ match len)
(1- count)))))))))))
(defun string-replace (old string new &key (start 0) end stream)
"Like `string-replace-all', but only replace the first match."
(string-replace-all old string new
:start start
:end end
:stream stream
:count 1))
(-> chomp (string &optional list) string)
(defun chomp (string
&optional
(suffixes '#.(sort
(remove-duplicates
(list (string #\Newline)
(string #\Linefeed)
(string #\Return)
(coerce (list #\Return #\Linefeed) 'string))
:test 'equal)
#'> :key #'length)
suffixes-supplied-p))
"If STRING ends in one of SUFFIXES, remove that suffix.
SUFFIXES defaults to a Lisp newline, a literal line feed, a literal
carriage return, or a literal carriage return followed by a literal
line feed.
Takes care that the longest suffix is always removed first."
(check-type string string)
(check-type suffixes list)
(reduce (lambda (string sep)
(if (string$= sep string)
(subseq string 0 (- (length string) (length sep)))
string))
(if suffixes-supplied-p
(sort (copy-seq suffixes) #'> :key #'length)
suffixes)
:initial-value string))
(defun string-count (substring string &key (start 0) end)
"Count how many times SUBSTRING appears in STRING."
(declare (array-length start)
((or array-length null) end))
(let* ((substring (simplify-string (string substring)))
(string (string string))
(end (or end (length string)))
(len (length substring)))
(with-string-dispatch () string
(nlet rec ((start start)
(hits 0))
;; There can't be more hits than characters.
(declare (type array-length hits))
(let ((match (search substring string :start2 start :end2 end)))
(if (not match)
hits
(rec (+ match len) (1+ hits))))))))
(declaim (ftype (function (&rest t) string) string+))
(defun string+ (&rest args)
"Optimized function for building small strings.
Roughly equivalent to
(let ((*print-pretty* nil))
(format nil \"~@{~a}\" args...))
But with a compiler macro that can sometimes result in more efficient
code."
(declare (dynamic-extent args))
(if (null args) ""
(let ((*print-pretty* nil))
(with-output-to-string (s)
(dolist (arg args)
(typecase arg
(string (write-string arg s))
(character (write-char arg s))
(t (princ arg s))))))))
(defun simplify-args-for-string-plus (args &optional env)
(reduce (lambda (x args)
;; Merge together runs of strings.
(if (and (stringp x)
(stringp (car args)))
(cons (concat x (car args))
(cdr args))
(cons x args)))
args
:from-end t
:key (named-lambda stringify (arg)
;; Stringify constant arguments when possible.
(trivia:match arg
((and arg (type character))
(string arg))
((or (and sym (type keyword))
(list 'quote (and sym (type symbol))))
;; Note that `*print-case*' may affect how symbols
;; are printed, so even if the symbol is constant
;; we can only be sure of the printed
;; representation if there are no uppercase
;; characters.
(let ((s (symbol-name sym)))
(if (notany #'upper-case-p s) s
arg)))
;; The smallest base is 2, so these are always the
;; same regardless of `*print-base*'.
((eql 0) "0")
((eql 1) "1")
((list 'quote (and s (type string)))
s)
((list 'quote (and c (type character)))
(string c))
(otherwise
(multiple-value-bind (val evaluated?)
(eval-if-constant arg env)
(if (equal arg val) arg
(if evaluated?
(stringify `(quote ,val))
arg))))))
:initial-value '()))
(define-compiler-macro string+ (&whole call
&environment env
&rest args)
(if (null args) ""
(let ((args (simplify-args-for-string-plus args env)))
(if (> (length args) 20) call
(if (= (length args) 1)
(if (stringp (first args))
`(copy-seq ,(first args))
`(princ-to-string ,(first args)))
;; If the arguments are reasonably few, unroll the
;; loop.
(with-unique-names (stream)
`(let ((*print-pretty* nil))
(with-output-to-string (,stream)
,@(loop for arg in args
if (stringp arg)
collect `(write-string ,arg ,stream)
else
collect `(princ ,arg ,stream))))))))))