Ruby 3.3.2p78 (2024-05-30 revision e5a195edf62fe1bf7146a191da13fa1c4fecbd71)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
47
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
50# include <crypt.h>
51# endif
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
55#endif
56
57#define BEG(no) (regs->beg[(no)])
58#define END(no) (regs->end[(no)])
59
60#undef rb_str_new
61#undef rb_usascii_str_new
62#undef rb_utf8_str_new
63#undef rb_enc_str_new
64#undef rb_str_new_cstr
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
72#undef rb_str_buf_cat
73#undef rb_str_buf_cat2
74#undef rb_str_cat2
75#undef rb_str_cat_cstr
76#undef rb_fstring_cstr
77
80
81/* FLAGS of RString
82 *
83 * 1: RSTRING_NOEMBED
84 * 2: STR_SHARED (== ELTS_SHARED)
85 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
86 * other strings that rely on this string's buffer)
87 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
88 * early, specific to rb_str_tmp_frozen_{acquire,release})
89 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
90 * such as read(2). Any modification and realloc is prohibited)
91 *
92 * 8-9: ENC_CODERANGE (2 bits)
93 * 10-16: ENCODING (7 bits == 128)
94 * 17: RSTRING_FSTR
95 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
96 * used for a string object based on C string literal)
97 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
98 * object header is temporarily allocated on C stack)
99 */
100
101#define RUBY_MAX_CHAR_LEN 16
102#define STR_SHARED_ROOT FL_USER5
103#define STR_BORROWED FL_USER6
104#define STR_TMPLOCK FL_USER7
105#define STR_NOFREE FL_USER18
106#define STR_FAKESTR FL_USER19
107
/* Flag `str` as heap-backed (not embedded) and clear every flag that
 * describes sharing of its buffer. */
#define STR_SET_NOEMBED(str) do { \
    FL_SET((str), STR_NOEMBED); \
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED); \
} while (0)

/* Flag `str` as embedded; STR_NOFREE is cleared together with STR_NOEMBED. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))

/* Store `n` in the length field of `str`; nothing else is touched. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
117
118static inline bool
119str_enc_fastpath(VALUE str)
120{
121 // The overwhelming majority of strings are in one of these 3 encodings.
122 switch (ENCODING_GET_INLINED(str)) {
123 case ENCINDEX_ASCII_8BIT:
124 case ENCINDEX_UTF_8:
125 case ENCINDEX_US_ASCII:
126 return true;
127 default:
128 return false;
129 }
130}
131
/* Terminator width of `str` in bytes: 1 for the fast-path encodings,
 * otherwise the minimum character length of the string's encoding. */
#define TERM_LEN(str) \
    (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Zero `termlen` bytes starting at `ptr`.  The first byte is always
 * cleared directly; memset() is reached only for terminators wider than
 * one byte. */
#define TERM_FILL(ptr, termlen) do { \
    char *const term_fill_ptr = (ptr); \
    const int term_fill_len = (termlen); \
    *term_fill_ptr = '\0'; \
    if (UNLIKELY(term_fill_len > 1)) \
        memset(term_fill_ptr, 0, term_fill_len); \
} while (0)
140
/*
 * RESIZE_CAPA(str, capacity)
 *   Resize the buffer of `str` to hold `capacity` bytes plus the
 *   terminator implied by the string's encoding (TERM_LEN).
 *
 * RESIZE_CAPA_TERM(str, capacity, termlen)
 *   Same, with an explicit terminator width.
 *   - Embedded string: nothing happens while the embedded area is large
 *     enough; otherwise a heap buffer is allocated, the current bytes are
 *     copied into it and the string is switched to the heap representation.
 *     The copy is done before as.heap.ptr is written because the old
 *     contents are read through RSTRING_PTR(str) of the still-embedded
 *     string.
 *   - Heap string: must not be shared (asserted); the buffer is
 *     reallocated in place.
 *
 * Every use of `capacity` and `termlen` is parenthesized so that an
 * argument containing a low-precedence operator cannot change the meaning
 * of the comparison or of the size computation.  The arguments are still
 * evaluated more than once, so they must be free of side effects.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
164
/*
 * Make `str` share the buffer owned by `shared_str`.
 * Does nothing when `str` carries STR_FAKESTR.  Otherwise it asserts that
 * the buffer pointer of `str` lies within [start, start + len] of
 * `shared_str`, records `shared_str` in as.heap.aux.shared through the
 * write barrier, flags `str` as STR_SHARED and `shared_str` as
 * STR_SHARED_ROOT, and additionally flags a class-less `shared_str` as
 * STR_BORROWED.
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
176
177#define STR_HEAP_PTR(str) (RSTRING(str)->as.heap.ptr)
178#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
179/* TODO: include the terminator size in capa. */
180
181#define STR_ENC_GET(str) get_encoding(str)
182
183#if !defined SHARABLE_MIDDLE_SUBSTRING
184# define SHARABLE_MIDDLE_SUBSTRING 0
185#endif
186#if !SHARABLE_MIDDLE_SUBSTRING
187#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
188#else
189#define SHARABLE_SUBSTRING_P(beg, len, end) 1
190#endif
191
192
193static inline long
194str_embed_capa(VALUE str)
195{
196 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
197}
198
199bool
200rb_str_reembeddable_p(VALUE str)
201{
202 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
203}
204
205static inline size_t
206rb_str_embed_size(long capa)
207{
208 return offsetof(struct RString, as.embed.ary) + capa;
209}
210
211size_t
212rb_str_size_as_embedded(VALUE str)
213{
214 size_t real_size;
215 if (STR_EMBED_P(str)) {
216 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
217 }
218 /* if the string is not currently embedded, but it can be embedded, how
219 * much space would it require */
220 else if (rb_str_reembeddable_p(str)) {
221 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
222 }
223 else {
224 real_size = sizeof(struct RString);
225 }
226 return real_size;
227}
228
/* Asks the GC whether an object large enough to embed `len` content bytes
 * plus `termlen` terminator bytes can be allocated. */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
234
235static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
236static VALUE str_new_frozen(VALUE klass, VALUE orig);
237static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
238static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
239static VALUE str_new(VALUE klass, const char *ptr, long len);
240static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
241static inline void str_modifiable(VALUE str);
242static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
243
244static inline void
245str_make_independent(VALUE str)
246{
247 long len = RSTRING_LEN(str);
248 int termlen = TERM_LEN(str);
249 str_make_independent_expand((str), len, 0L, termlen);
250}
251
252static inline int str_dependent_p(VALUE str);
253
254void
255rb_str_make_independent(VALUE str)
256{
257 if (str_dependent_p(str)) {
258 str_make_independent(str);
259 }
260}
261
262void
263rb_str_make_embedded(VALUE str)
264{
265 RUBY_ASSERT(rb_str_reembeddable_p(str));
266 RUBY_ASSERT(!STR_EMBED_P(str));
267
268 char *buf = RSTRING(str)->as.heap.ptr;
269 long len = RSTRING(str)->len;
270
271 STR_SET_EMBED(str);
272 STR_SET_LEN(str, len);
273
274 if (len > 0) {
275 memcpy(RSTRING_PTR(str), buf, len);
276 ruby_xfree(buf);
277 }
278
279 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
280}
281
/* Prints a diagnostic to stderr saying that the function named `func` is
 * about to return NULL.  Only writes the message; returns nothing. */
void
rb_debug_rstring_null_ptr(const char *func)
{
    FILE *const out = stderr;

    fprintf(out, "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
291
292/* symbols for [up|down|swap]case/capitalize options */
293static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
294
295static rb_encoding *
296get_encoding(VALUE str)
297{
298 return rb_enc_from_index(ENCODING_GET(str));
299}
300
301static void
302mustnot_broken(VALUE str)
303{
304 if (is_broken_string(str)) {
305 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
306 }
307}
308
309static void
310mustnot_wchar(VALUE str)
311{
312 rb_encoding *enc = STR_ENC_GET(str);
313 if (rb_enc_mbminlen(enc) > 1) {
314 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
315 }
316}
317
318static int fstring_cmp(VALUE a, VALUE b);
319
320static VALUE register_fstring(VALUE str, bool copy);
321
322const struct st_hash_type rb_fstring_hash_type = {
323 fstring_cmp,
325};
326
327#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
328
330 VALUE fstr;
331 bool copy;
332};
333
334static int
335fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
336{
337
338 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
339 VALUE str = (VALUE)*key;
340
341 if (existing) {
342 /* because of lazy sweep, str may be unmarked already and swept
343 * at next time */
344
345 if (rb_objspace_garbage_object_p(str)) {
346 arg->fstr = Qundef;
347 return ST_DELETE;
348 }
349
350 arg->fstr = str;
351 return ST_STOP;
352 }
353 else {
354 if (FL_TEST_RAW(str, STR_FAKESTR)) {
355 if (arg->copy) {
356 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
357 rb_enc_copy(new_str, str);
358 str = new_str;
359 }
360 else {
361 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
362 RSTRING(str)->len,
363 ENCODING_GET(str));
364 }
365 OBJ_FREEZE_RAW(str);
366 }
367 else {
368 if (!OBJ_FROZEN(str))
369 str = str_new_frozen(rb_cString, str);
370 if (STR_SHARED_P(str)) { /* str should not be shared */
371 /* shared substring */
372 str_make_independent(str);
373 assert(OBJ_FROZEN(str));
374 }
375 if (!BARE_STRING_P(str)) {
376 str = str_new_frozen(rb_cString, str);
377 }
378 }
379 RBASIC(str)->flags |= RSTRING_FSTR;
380
381 *key = *value = arg->fstr = str;
382 return ST_CONTINUE;
383 }
384}
385
386RUBY_FUNC_EXPORTED
387VALUE
388rb_fstring(VALUE str)
389{
390 VALUE fstr;
391 int bare;
392
393 Check_Type(str, T_STRING);
394
395 if (FL_TEST(str, RSTRING_FSTR))
396 return str;
397
398 bare = BARE_STRING_P(str);
399 if (!bare) {
400 if (STR_EMBED_P(str)) {
401 OBJ_FREEZE_RAW(str);
402 return str;
403 }
404
405 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
406 assert(OBJ_FROZEN(str));
407 return str;
408 }
409 }
410
411 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE))
412 rb_str_resize(str, RSTRING_LEN(str));
413
414 fstr = register_fstring(str, FALSE);
415
416 if (!bare) {
417 str_replace_shared_without_enc(str, fstr);
418 OBJ_FREEZE_RAW(str);
419 return str;
420 }
421 return fstr;
422}
423
424static VALUE
425register_fstring(VALUE str, bool copy)
426{
427 struct fstr_update_arg args;
428 args.copy = copy;
429
430 RB_VM_LOCK_ENTER();
431 {
432 st_table *frozen_strings = rb_vm_fstring_table();
433 do {
434 args.fstr = str;
435 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
436 } while (UNDEF_P(args.fstr));
437 }
438 RB_VM_LOCK_LEAVE();
439
440 assert(OBJ_FROZEN(args.fstr));
441 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
442 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
443 assert(RBASIC_CLASS(args.fstr) == rb_cString);
444 return args.fstr;
445}
446
447static VALUE
448setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
449{
450 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
451 /* SHARED to be allocated by the callback */
452
453 if (!name) {
455 name = "";
456 }
457
458 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
459
460 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
461 fake_str->len = len;
462 fake_str->as.heap.ptr = (char *)name;
463 fake_str->as.heap.aux.capa = len;
464 return (VALUE)fake_str;
465}
466
467/*
468 * set up a fake string which refers a static string literal.
469 */
470VALUE
471rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
472{
473 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
474}
475
476/*
477 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
478 * shared string which refers a static string literal. `ptr` must
479 * point a constant string.
480 */
481VALUE
482rb_fstring_new(const char *ptr, long len)
483{
484 struct RString fake_str;
485 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
486}
487
488VALUE
489rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
490{
491 struct RString fake_str;
492 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
493}
494
495VALUE
496rb_fstring_cstr(const char *ptr)
497{
498 return rb_fstring_new(ptr, strlen(ptr));
499}
500
501static int
502fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
503{
504 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
505 return ST_CONTINUE;
506}
507
508static int
509fstring_cmp(VALUE a, VALUE b)
510{
511 long alen, blen;
512 const char *aptr, *bptr;
513 RSTRING_GETMEM(a, aptr, alen);
514 RSTRING_GETMEM(b, bptr, blen);
515 return (alen != blen ||
516 ENCODING_GET(a) != ENCODING_GET(b) ||
517 memcmp(aptr, bptr, alen) != 0);
518}
519
520static inline int
521single_byte_optimizable(VALUE str)
522{
523 rb_encoding *enc;
524
525 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
527 return 1;
528
529 enc = STR_ENC_GET(str);
530 if (rb_enc_mbmaxlen(enc) == 1)
531 return 1;
532
533 /* Conservative. Possibly single byte.
534 * "\xa1" in Shift_JIS for example. */
535 return 0;
536}
537
539
/*
 * Scans the byte range [p, e) and returns a pointer to the first byte whose
 * high bit (0x80) is set, or NULL when no such byte is found.
 *
 * When possible the middle of the range is tested one uintptr_t word at a
 * time against NONASCII_MASK (0x80 repeated in every byte); the bytes before
 * the first word boundary and after the last full word are tested one by one.
 */
540static inline const char *
541search_nonascii(const char *p, const char *e)
542{
543 const uintptr_t *s, *t;
544
/* NONASCII_MASK: a word-sized constant with 0x80 in each byte.  The C99
 * branch uses the UINT64_C/UINT32_C macros, the other branch builds the
 * same value by hand. */
545#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
546# if SIZEOF_UINTPTR_T == 8
547# define NONASCII_MASK UINT64_C(0x8080808080808080)
548# elif SIZEOF_UINTPTR_T == 4
549# define NONASCII_MASK UINT32_C(0x80808080)
550# else
551# error "don't know what to do."
552# endif
553#else
554# if SIZEOF_UINTPTR_T == 8
555# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
556# elif SIZEOF_UINTPTR_T == 4
557# define NONASCII_MASK 0x80808080UL /* or...? */
558# else
559# error "don't know what to do."
560# endif
561#endif
562
/* Word-wise scan: entered when unaligned word access is allowed, or when
 * at least one pointer-sized run of bytes remains. */
563 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
564#if !UNALIGNED_WORD_ACCESS
/* Advance p to the next word boundary; the `l` skipped bytes are checked
 * individually.  The cases fall through on purpose. */
565 if ((uintptr_t)p % SIZEOF_VOIDP) {
566 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
567 p += l;
568 switch (l) {
569 default: UNREACHABLE;
570#if SIZEOF_VOIDP > 4
571 case 7: if (p[-7]&0x80) return p-7;
572 case 6: if (p[-6]&0x80) return p-6;
573 case 5: if (p[-5]&0x80) return p-5;
574 case 4: if (p[-4]&0x80) return p-4;
575#endif
576 case 3: if (p[-3]&0x80) return p-3;
577 case 2: if (p[-2]&0x80) return p-2;
578 case 1: if (p[-1]&0x80) return p-1;
579 case 0: break;
580 }
581 }
582#endif
583#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
584#define aligned_ptr(value) \
585 __builtin_assume_aligned((value), sizeof(uintptr_t))
586#else
587#define aligned_ptr(value) (uintptr_t *)(value)
588#endif
589 s = aligned_ptr(p);
/* t bounds the loop so that a full word read at s never extends past e. */
590 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
591#undef aligned_ptr
592 for (;s < t; s++) {
593 if (*s & NONASCII_MASK) {
/* The bit position of the first flagged byte, shifted right by 3, is its
 * byte offset inside the word.  NOTE(review): presumably nlz_intptr and
 * ntz_intptr count leading and trailing zero bits; they are defined
 * outside this view. */
594#ifdef WORDS_BIGENDIAN
595 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
596#else
597 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
598#endif
599 }
600 }
601 p = (const char *)s;
602 }
603
/* Tail: check the remaining e - p bytes one by one (intentional
 * fallthrough); reaching case 0 means nothing was found. */
604 switch (e - p) {
605 default: UNREACHABLE;
606#if SIZEOF_VOIDP > 4
607 case 7: if (e[-7]&0x80) return e-7;
608 case 6: if (e[-6]&0x80) return e-6;
609 case 5: if (e[-5]&0x80) return e-5;
610 case 4: if (e[-4]&0x80) return e-4;
611#endif
612 case 3: if (e[-3]&0x80) return e-3;
613 case 2: if (e[-2]&0x80) return e-2;
614 case 1: if (e[-1]&0x80) return e-1;
615 case 0: return NULL;
616 }
617}
618
619static int
620coderange_scan(const char *p, long len, rb_encoding *enc)
621{
622 const char *e = p + len;
623
624 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
625 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
626 p = search_nonascii(p, e);
628 }
629
630 if (rb_enc_asciicompat(enc)) {
631 p = search_nonascii(p, e);
632 if (!p) return ENC_CODERANGE_7BIT;
633 for (;;) {
634 int ret = rb_enc_precise_mbclen(p, e, enc);
636 p += MBCLEN_CHARFOUND_LEN(ret);
637 if (p == e) break;
638 p = search_nonascii(p, e);
639 if (!p) break;
640 }
641 }
642 else {
643 while (p < e) {
644 int ret = rb_enc_precise_mbclen(p, e, enc);
646 p += MBCLEN_CHARFOUND_LEN(ret);
647 }
648 }
649 return ENC_CODERANGE_VALID;
650}
651
652long
653rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
654{
655 const char *p = s;
656
657 if (*cr == ENC_CODERANGE_BROKEN)
658 return e - s;
659
660 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
661 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
662 if (*cr == ENC_CODERANGE_VALID) return e - s;
663 p = search_nonascii(p, e);
665 return e - s;
666 }
667 else if (rb_enc_asciicompat(enc)) {
668 p = search_nonascii(p, e);
669 if (!p) {
670 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
671 return e - s;
672 }
673 for (;;) {
674 int ret = rb_enc_precise_mbclen(p, e, enc);
675 if (!MBCLEN_CHARFOUND_P(ret)) {
677 return p - s;
678 }
679 p += MBCLEN_CHARFOUND_LEN(ret);
680 if (p == e) break;
681 p = search_nonascii(p, e);
682 if (!p) break;
683 }
684 }
685 else {
686 while (p < e) {
687 int ret = rb_enc_precise_mbclen(p, e, enc);
688 if (!MBCLEN_CHARFOUND_P(ret)) {
690 return p - s;
691 }
692 p += MBCLEN_CHARFOUND_LEN(ret);
693 }
694 }
696 return e - s;
697}
698
699static inline void
700str_enc_copy(VALUE str1, VALUE str2)
701{
702 rb_enc_set_index(str1, ENCODING_GET(str2));
703}
704
705/* Like str_enc_copy, but does not check frozen status of str1.
706 * You should use this only if you're certain that str1 is not frozen. */
707static inline void
708str_enc_copy_direct(VALUE str1, VALUE str2)
709{
710 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
711 if (inlined_encoding == ENCODING_INLINE_MAX) {
712 rb_enc_set_index(str1, rb_enc_get_index(str2));
713 }
714 else {
715 ENCODING_SET_INLINED(str1, inlined_encoding);
716 }
717}
718
719static void
720rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
721{
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
724 */
725 str_enc_copy(dest, src);
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
729 else
731 return;
732 }
733 switch (ENC_CODERANGE(src)) {
736 break;
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
741 else
743 break;
744 default:
745 break;
746 }
747}
748
749static void
750rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
751{
752 str_enc_copy(dest, src);
754}
755
756static int
757enc_coderange_scan(VALUE str, rb_encoding *enc)
758{
759 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
760}
761
762int
763rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
764{
765 return enc_coderange_scan(str, enc);
766}
767
768int
770{
771 int cr = ENC_CODERANGE(str);
772
773 if (cr == ENC_CODERANGE_UNKNOWN) {
774 cr = enc_coderange_scan(str, get_encoding(str));
775 ENC_CODERANGE_SET(str, cr);
776 }
777 return cr;
778}
779
780int
782{
783 rb_encoding *enc = STR_ENC_GET(str);
784
785 if (!rb_enc_asciicompat(enc))
786 return FALSE;
787 else if (is_ascii_string(str))
788 return TRUE;
789 return FALSE;
790}
791
792static inline void
793str_mod_check(VALUE s, const char *p, long len)
794{
795 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
796 rb_raise(rb_eRuntimeError, "string modified");
797 }
798}
799
800static size_t
801str_capacity(VALUE str, const int termlen)
802{
803 if (STR_EMBED_P(str)) {
804 return str_embed_capa(str) - termlen;
805 }
806 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
807 return RSTRING(str)->len;
808 }
809 else {
810 return RSTRING(str)->as.heap.aux.capa;
811 }
812}
813
814size_t
816{
817 return str_capacity(str, TERM_LEN(str));
818}
819
820static inline void
821must_not_null(const char *ptr)
822{
823 if (!ptr) {
824 rb_raise(rb_eArgError, "NULL pointer given");
825 }
826}
827
828static inline VALUE
829str_alloc_embed(VALUE klass, size_t capa)
830{
831 size_t size = rb_str_embed_size(capa);
832 assert(size > 0);
833 assert(rb_gc_size_allocatable_p(size));
834
835 NEWOBJ_OF(str, struct RString, klass,
837
838 return (VALUE)str;
839}
840
841static inline VALUE
842str_alloc_heap(VALUE klass)
843{
844 NEWOBJ_OF(str, struct RString, klass,
845 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
846
847 return (VALUE)str;
848}
849
850static inline VALUE
851empty_str_alloc(VALUE klass)
852{
853 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
854 VALUE str = str_alloc_embed(klass, 0);
855 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
856 return str;
857}
858
859static VALUE
860str_new0(VALUE klass, const char *ptr, long len, int termlen)
861{
862 VALUE str;
863
864 if (len < 0) {
865 rb_raise(rb_eArgError, "negative string size (or size too big)");
866 }
867
868 RUBY_DTRACE_CREATE_HOOK(STRING, len);
869
870 if (STR_EMBEDDABLE_P(len, termlen)) {
871 str = str_alloc_embed(klass, len + termlen);
872 if (len == 0) {
874 }
875 }
876 else {
877 str = str_alloc_heap(klass);
878 RSTRING(str)->as.heap.aux.capa = len;
879 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
880 * integer overflow. If we can STATIC_ASSERT that, the following
881 * mul_add_mul can be reverted to a simple ALLOC_N. */
882 RSTRING(str)->as.heap.ptr =
883 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
884 }
885 if (ptr) {
886 memcpy(RSTRING_PTR(str), ptr, len);
887 }
888 STR_SET_LEN(str, len);
889 TERM_FILL(RSTRING_PTR(str) + len, termlen);
890 return str;
891}
892
893static VALUE
894str_new(VALUE klass, const char *ptr, long len)
895{
896 return str_new0(klass, ptr, len, 1);
897}
898
899VALUE
900rb_str_new(const char *ptr, long len)
901{
902 return str_new(rb_cString, ptr, len);
903}
904
905VALUE
906rb_usascii_str_new(const char *ptr, long len)
907{
908 VALUE str = rb_str_new(ptr, len);
909 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
910 return str;
911}
912
913VALUE
914rb_utf8_str_new(const char *ptr, long len)
915{
916 VALUE str = str_new(rb_cString, ptr, len);
917 rb_enc_associate_index(str, rb_utf8_encindex());
918 return str;
919}
920
921VALUE
922rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
923{
924 VALUE str;
925
926 if (!enc) return rb_str_new(ptr, len);
927
928 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
929 rb_enc_associate(str, enc);
930 return str;
931}
932
933VALUE
935{
936 must_not_null(ptr);
937 /* rb_str_new_cstr() can take pointer from non-malloc-generated
938 * memory regions, and that cannot be detected by the MSAN. Just
939 * trust the programmer that the argument passed here is a sane C
940 * string. */
941 __msan_unpoison_string(ptr);
942 return rb_str_new(ptr, strlen(ptr));
943}
944
945VALUE
947{
949 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
950 return str;
951}
952
953VALUE
955{
957 rb_enc_associate_index(str, rb_utf8_encindex());
958 return str;
959}
960
961VALUE
963{
964 must_not_null(ptr);
965 if (rb_enc_mbminlen(enc) != 1) {
966 rb_raise(rb_eArgError, "wchar encoding given");
967 }
968 return rb_enc_str_new(ptr, strlen(ptr), enc);
969}
970
971static VALUE
972str_new_static(VALUE klass, const char *ptr, long len, int encindex)
973{
974 VALUE str;
975
976 if (len < 0) {
977 rb_raise(rb_eArgError, "negative string size (or size too big)");
978 }
979
980 if (!ptr) {
981 rb_encoding *enc = rb_enc_get_from_index(encindex);
982 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
983 }
984 else {
985 RUBY_DTRACE_CREATE_HOOK(STRING, len);
986 str = str_alloc_heap(klass);
987 RSTRING(str)->len = len;
988 RSTRING(str)->as.heap.ptr = (char *)ptr;
989 RSTRING(str)->as.heap.aux.capa = len;
990 RBASIC(str)->flags |= STR_NOFREE;
991 }
992 rb_enc_associate_index(str, encindex);
993 return str;
994}
995
996VALUE
997rb_str_new_static(const char *ptr, long len)
998{
999 return str_new_static(rb_cString, ptr, len, 0);
1000}
1001
1002VALUE
1004{
1005 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1006}
1007
1008VALUE
1010{
1011 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1012}
1013
1014VALUE
1016{
1017 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1018}
1019
1020static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1021 rb_encoding *from, rb_encoding *to,
1022 int ecflags, VALUE ecopts);
1023
1024static inline bool
1025is_enc_ascii_string(VALUE str, rb_encoding *enc)
1026{
1027 int encidx = rb_enc_to_index(enc);
1028 if (rb_enc_get_index(str) == encidx)
1029 return is_ascii_string(str);
1030 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1031}
1032
1033VALUE
1034rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1035{
1036 long len;
1037 const char *ptr;
1038 VALUE newstr;
1039
1040 if (!to) return str;
1041 if (!from) from = rb_enc_get(str);
1042 if (from == to) return str;
1043 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1044 rb_is_ascii8bit_enc(to)) {
1045 if (STR_ENC_GET(str) != to) {
1046 str = rb_str_dup(str);
1047 rb_enc_associate(str, to);
1048 }
1049 return str;
1050 }
1051
1052 RSTRING_GETMEM(str, ptr, len);
1053 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1054 from, to, ecflags, ecopts);
1055 if (NIL_P(newstr)) {
1056 /* some error, return original */
1057 return str;
1058 }
1059 return newstr;
1060}
1061
1062VALUE
1063rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1064 rb_encoding *from, int ecflags, VALUE ecopts)
1065{
1066 long olen;
1067
1068 olen = RSTRING_LEN(newstr);
1069 if (ofs < -olen || olen < ofs)
1070 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1071 if (ofs < 0) ofs += olen;
1072 if (!from) {
1073 STR_SET_LEN(newstr, ofs);
1074 return rb_str_cat(newstr, ptr, len);
1075 }
1076
1077 rb_str_modify(newstr);
1078 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1079 rb_enc_get(newstr),
1080 ecflags, ecopts);
1081}
1082
1083VALUE
1084rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1085{
1086 STR_SET_LEN(str, 0);
1087 rb_enc_associate(str, enc);
1088 rb_str_cat(str, ptr, len);
1089 return str;
1090}
1091
1092static VALUE
1093str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1094 rb_encoding *from, rb_encoding *to,
1095 int ecflags, VALUE ecopts)
1096{
1097 rb_econv_t *ec;
1099 long olen;
1100 VALUE econv_wrapper;
1101 const unsigned char *start, *sp;
1102 unsigned char *dest, *dp;
1103 size_t converted_output = (size_t)ofs;
1104
1105 olen = rb_str_capacity(newstr);
1106
1107 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1108 RBASIC_CLEAR_CLASS(econv_wrapper);
1109 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1110 if (!ec) return Qnil;
1111 DATA_PTR(econv_wrapper) = ec;
1112
1113 sp = (unsigned char*)ptr;
1114 start = sp;
1115 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1116 (dp = dest + converted_output),
1117 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1119 /* destination buffer short */
1120 size_t converted_input = sp - start;
1121 size_t rest = len - converted_input;
1122 converted_output = dp - dest;
1123 rb_str_set_len(newstr, converted_output);
1124 if (converted_input && converted_output &&
1125 rest < (LONG_MAX / converted_output)) {
1126 rest = (rest * converted_output) / converted_input;
1127 }
1128 else {
1129 rest = olen;
1130 }
1131 olen += rest < 2 ? 2 : rest;
1132 rb_str_resize(newstr, olen);
1133 }
1134 DATA_PTR(econv_wrapper) = 0;
1135 rb_econv_close(ec);
1136 switch (ret) {
1137 case econv_finished:
1138 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1139 rb_str_set_len(newstr, len);
1140 rb_enc_associate(newstr, to);
1141 return newstr;
1142
1143 default:
1144 return Qnil;
1145 }
1146}
1147
1148VALUE
1150{
1151 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1152}
1153
1154VALUE
1156{
1157 rb_encoding *ienc;
1158 VALUE str;
1159 const int eidx = rb_enc_to_index(eenc);
1160
1161 if (!ptr) {
1162 return rb_enc_str_new(ptr, len, eenc);
1163 }
1164
1165 /* ASCII-8BIT case, no conversion */
1166 if ((eidx == rb_ascii8bit_encindex()) ||
1167 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1168 return rb_str_new(ptr, len);
1169 }
1170 /* no default_internal or same encoding, no conversion */
1171 ienc = rb_default_internal_encoding();
1172 if (!ienc || eenc == ienc) {
1173 return rb_enc_str_new(ptr, len, eenc);
1174 }
1175 /* ASCII compatible, and ASCII only string, no conversion in
1176 * default_internal */
1177 if ((eidx == rb_ascii8bit_encindex()) ||
1178 (eidx == rb_usascii_encindex()) ||
1179 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1180 return rb_enc_str_new(ptr, len, ienc);
1181 }
1182 /* convert from the given encoding to default_internal */
1183 str = rb_enc_str_new(NULL, 0, ienc);
1184 /* when the conversion failed for some reason, just ignore the
1185 * default_internal and result in the given encoding as-is. */
1186 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1187 rb_str_initialize(str, ptr, len, eenc);
1188 }
1189 return str;
1190}
1191
1192VALUE
1193rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1194{
1195 int eidx = rb_enc_to_index(eenc);
1196 if (eidx == rb_usascii_encindex() &&
1197 !is_ascii_string(str)) {
1198 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1199 return str;
1200 }
1201 rb_enc_associate_index(str, eidx);
1202 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1203}
1204
1205VALUE
1206rb_external_str_new(const char *ptr, long len)
1207{
1208 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1209}
1210
1211VALUE
1213{
1214 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1215}
1216
1217VALUE
1218rb_locale_str_new(const char *ptr, long len)
1219{
1220 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1221}
1222
1223VALUE
1225{
1226 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1227}
1228
1229VALUE
1231{
1232 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1233}
1234
1235VALUE
1236rb_filesystem_str_new_cstr(const char *ptr)
1237{
1238 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1239}
1240
1241VALUE
1243{
1244 return rb_str_export_to_enc(str, rb_default_external_encoding());
1245}
1246
1247VALUE
1249{
1250 return rb_str_export_to_enc(str, rb_locale_encoding());
1251}
1252
1253VALUE
1255{
1256 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1257}
1258
/*
 * Makes str2 hold the same bytes as str; encoding and coderange are not
 * touched here (str_replace_shared() copies them afterwards).
 *
 * If the content plus terminator fits str2's embedded capacity, the bytes
 * are copied into str2.  Otherwise str2 is pointed at the buffer of a
 * frozen root string and marked as sharing it.  Returns str2.
 */
1259static VALUE
1260str_replace_shared_without_enc(VALUE str2, VALUE str)
1261{
1262 const int termlen = TERM_LEN(str);
1263 char *ptr;
1264 long len;
1265
1266 RSTRING_GETMEM(str, ptr, len);
1267 if (str_embed_capa(str2) >= len + termlen) {
/* Small enough: copy into str2's own embedded storage. */
1268 char *ptr2 = RSTRING(str2)->as.embed.ary;
1269 STR_SET_EMBED(str2);
1270 memcpy(ptr2, RSTRING_PTR(str), len);
1271 TERM_FILL(ptr2+len, termlen);
1272 }
1273 else {
1274 VALUE root;
1275 if (STR_SHARED_P(str)) {
/* str already shares a buffer: reuse its root, keep str's own view. */
1276 root = RSTRING(str)->as.heap.aux.shared;
1277 RSTRING_GETMEM(str, ptr, len);
1278 }
1279 else {
/* Obtain a frozen string for str and take pointer/length from it. */
1280 root = rb_str_new_frozen(str);
1281 RSTRING_GETMEM(root, ptr, len);
1282 }
1283 assert(OBJ_FROZEN(root));
/* Release str2's previous heap buffer only if str2 owns it (not embedded,
 * not shared, not NOFREE) and it is not the buffer we are about to adopt. */
1284 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1285 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1286 rb_fatal("about to free a possible shared root");
1287 }
1288 char *ptr2 = STR_HEAP_PTR(str2);
1289 if (ptr2 != ptr) {
1290 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1291 }
1292 }
1293 FL_SET(str2, STR_NOEMBED);
1294 RSTRING(str2)->as.heap.ptr = ptr;
1295 STR_SET_SHARED(str2, root);
1296 }
1297
1298 STR_SET_LEN(str2, len);
1299
1300 return str2;
1301}
1302
1303static VALUE
1304str_replace_shared(VALUE str2, VALUE str)
1305{
1306 str_replace_shared_without_enc(str2, str);
1307 rb_enc_cr_str_exact_copy(str2, str);
1308 return str2;
1309}
1310
1311static VALUE
1312str_new_shared(VALUE klass, VALUE str)
1313{
1314 return str_replace_shared(str_alloc_heap(klass), str);
1315}
1316
1317VALUE
1319{
1320 return str_new_shared(rb_obj_class(str), str);
1321}
1322
1323VALUE
1324rb_str_new_frozen(VALUE orig)
1325{
1326 if (OBJ_FROZEN(orig)) return orig;
1327 return str_new_frozen(rb_obj_class(orig), orig);
1328}
1329
1330static VALUE
1331rb_str_new_frozen_String(VALUE orig)
1332{
1333 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1334 return str_new_frozen(rb_cString, orig);
1335}
1336
1337VALUE
1338rb_str_tmp_frozen_acquire(VALUE orig)
1339{
1340 if (OBJ_FROZEN_RAW(orig)) return orig;
1341 return str_new_frozen_buffer(0, orig, FALSE);
1342}
1343
/*
 * Variant of rb_str_tmp_frozen_acquire() whose result is never embedded.
 *
 * Returns orig itself when it is frozen, heap-allocated and not
 * re-embeddable.  When orig is shared with a non-embedded root, defers to
 * rb_str_tmp_frozen_acquire().  Otherwise builds a classless frozen heap
 * string that either copies orig's bytes or takes over orig's buffer.
 */
1344VALUE
1345rb_str_tmp_frozen_no_embed_acquire(VALUE orig)
1346{
1347 if (OBJ_FROZEN_RAW(orig) && !STR_EMBED_P(orig) && !rb_str_reembeddable_p(orig)) return orig;
1348 if (STR_SHARED_P(orig) && !STR_EMBED_P(RSTRING(orig)->as.heap.aux.shared)) return rb_str_tmp_frozen_acquire(orig);
1349
1350 VALUE str = str_alloc_heap(0);
1351 OBJ_FREEZE(str);
1352 /* Always set the STR_SHARED_ROOT to ensure it does not get re-embedded. */
1353 FL_SET(str, STR_SHARED_ROOT);
1354
1355 size_t capa = str_capacity(orig, TERM_LEN(orig));
1356
1357 /* If the string is embedded then we want to create a copy that is heap
1358 * allocated. If the string is shared then the shared root must be
1359 * embedded, so we want to create a copy. If the string is a shared root
1360 * then it must be embedded, so we want to create a copy. */
1361 if (STR_EMBED_P(orig) || FL_TEST_RAW(orig, STR_SHARED | STR_SHARED_ROOT)) {
/* Allocation size is capa + TERM_LEN(orig) bytes; capa bytes are copied. */
1362 RSTRING(str)->as.heap.ptr = rb_xmalloc_mul_add_mul(sizeof(char), capa, sizeof(char), TERM_LEN(orig));
1363 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), capa);
1364 }
1365 else {
1366 /* orig must be heap allocated and not shared, so we can safely transfer
1367 * the pointer to str. */
1368 RSTRING(str)->as.heap.ptr = RSTRING(orig)->as.heap.ptr;
/* Move the NOFREE bit along with the buffer, then make orig share str. */
1369 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1370 RBASIC(orig)->flags &= ~STR_NOFREE;
1371 STR_SET_SHARED(orig, str);
1372 }
1373
1374 RSTRING(str)->len = RSTRING(orig)->len;
1375 RSTRING(str)->as.heap.aux.capa = capa;
1376
1377 return str;
1378}
1379
/*
 * Counterpart of the rb_str_tmp_frozen_*acquire() functions.
 *
 * Does nothing when tmp has a class (only classless temporaries are
 * handled).  If orig is shared with tmp as its root, is neither
 * temp-locked nor frozen, and tmp is not flagged STR_BORROWED, orig is
 * turned back into an unshared string and tmp is emptied.
 */
1380void
1381rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1382{
1383 if (RBASIC_CLASS(tmp) != 0)
1384 return;
1385
1386 if (STR_EMBED_P(tmp)) {
/* Embedded temporary: nothing to hand back, only sanity-check it. */
1387 assert(OBJ_FROZEN_RAW(tmp));
1388 }
1389 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1390 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1391 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1392
1393 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1394 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1395 assert(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1396
1397 /* Unshare orig since the root (tmp) only has this one child. */
1398 FL_UNSET_RAW(orig, STR_SHARED);
/* aux is capa again (not the shared root); restore it and NOFREE from tmp. */
1399 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1400 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1401 assert(OBJ_FROZEN_RAW(tmp));
1402
1403 /* Make tmp embedded and empty so it is safe for sweeping. */
1404 STR_SET_EMBED(tmp);
1405 STR_SET_LEN(tmp, 0);
1406 }
1407 }
1408}
1409
1410static VALUE
1411str_new_frozen(VALUE klass, VALUE orig)
1412{
1413 return str_new_frozen_buffer(klass, orig, TRUE);
1414}
1415
/*
 * Creates a new heap string of class klass that takes over orig's buffer
 * (pointer, capacity and the STR_NOFREE bit) and turns orig into a string
 * sharing that new root.  orig must be a non-embedded, non-shared string
 * (asserted).  Returns the new root; it is not frozen here.
 */
1416static VALUE
1417heap_str_make_shared(VALUE klass, VALUE orig)
1418{
1419 assert(!STR_EMBED_P(orig));
1420 assert(!STR_SHARED_P(orig));
1421
1422 VALUE str = str_alloc_heap(klass);
1423 STR_SET_LEN(str, RSTRING_LEN(orig));
1424 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1425 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
/* The NOFREE bit travels with the buffer: copy it to str, clear it on orig. */
1426 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1427 RBASIC(orig)->flags &= ~STR_NOFREE;
1428 STR_SET_SHARED(orig, str);
/* A classless root starts out without the STR_BORROWED mark. */
1429 if (klass == 0)
1430 FL_UNSET_RAW(str, STR_BORROWED);
1431 return str;
1432}
1433
/*
 * Core of the "new frozen" family: returns a frozen string of class klass
 * with the content of orig.
 *
 * copy_encoding selects whether the terminator length comes from orig and
 * whether encoding/coderange are copied onto the result.
 *
 * Strategy, in order:
 *  - embedded or embeddable content: plain copy via str_new0();
 *  - orig shares a root: return the root itself when it matches exactly
 *    (same range, class and encoding), else create a new string sharing
 *    the root and narrow it to orig's range;
 *  - embeddable by orig's own terminator length: copy into an embedded string;
 *  - otherwise: heap_str_make_shared(), which makes orig share the result.
 */
1434static VALUE
1435str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1436{
1437 VALUE str;
1438
1439 long len = RSTRING_LEN(orig);
1440 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1441
1442 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1443 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1444 assert(STR_EMBED_P(str));
1445 }
1446 else {
1447 if (FL_TEST_RAW(orig, STR_SHARED)) {
1448 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
/* ofs: bytes of the root before orig's view; rest: bytes after it. */
1449 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1450 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1451 assert(ofs >= 0);
1452 assert(rest >= 0);
1453 assert(ofs + rest <= RSTRING_LEN(shared));
1454 assert(OBJ_FROZEN(shared));
1455
1456 if ((ofs > 0) || (rest > 0) ||
1457 (klass != RBASIC(shared)->klass) ||
1458 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1459 str = str_new_shared(klass, shared);
1460 assert(!STR_EMBED_P(str));
/* Narrow the new view from the whole root down to orig's range. */
1461 RSTRING(str)->as.heap.ptr += ofs;
1462 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1463 }
1464 else {
/* The root is an exact match: hand it out directly.  A classless root is
 * marked STR_BORROWED.  This early return skips the encoding copy and
 * freeze below; the root was asserted frozen above. */
1465 if (RBASIC_CLASS(shared) == 0)
1466 FL_SET_RAW(shared, STR_BORROWED);
1467 return shared;
1468 }
1469 }
1470 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1471 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1472 STR_SET_EMBED(str);
1473 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1474 STR_SET_LEN(str, RSTRING_LEN(orig));
1475 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1476 }
1477 else {
1478 str = heap_str_make_shared(klass, orig);
1479 }
1480 }
1481
1482 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1483 OBJ_FREEZE(str);
1484 return str;
1485}
1486
1487VALUE
1488rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1489{
1490 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1491}
1492
1493static VALUE
1494str_new_empty_String(VALUE str)
1495{
1496 VALUE v = rb_str_new(0, 0);
1497 rb_enc_copy(v, str);
1498 return v;
1499}
1500
/* Lower bound applied by rb_str_init() to an explicitly requested capacity. */
1501#define STR_BUF_MIN_SIZE 63
1502
1503VALUE
1504rb_str_buf_new(long capa)
1505{
1506 if (STR_EMBEDDABLE_P(capa, 1)) {
1507 return str_alloc_embed(rb_cString, capa + 1);
1508 }
1509
1510 VALUE str = str_alloc_heap(rb_cString);
1511
1512 RSTRING(str)->as.heap.aux.capa = capa;
1513 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1514 RSTRING(str)->as.heap.ptr[0] = '\0';
1515
1516 return str;
1517}
1518
1519VALUE
1521{
1522 VALUE str;
1523 long len = strlen(ptr);
1524
1525 str = rb_str_buf_new(len);
1526 rb_str_buf_cat(str, ptr, len);
1527
1528 return str;
1529}
1530
1531VALUE
1533{
1534 return str_new(0, 0, len);
1535}
1536
1537void
1539{
1540 if (FL_TEST(str, RSTRING_FSTR)) {
1541 st_data_t fstr = (st_data_t)str;
1542
1543 RB_VM_LOCK_ENTER();
1544 {
1545 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1546 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1547 }
1548 RB_VM_LOCK_LEAVE();
1549 }
1550
1551 if (STR_EMBED_P(str)) {
1552 RB_DEBUG_COUNTER_INC(obj_str_embed);
1553 }
1554 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1555 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1556 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1557 }
1558 else {
1559 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1560 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1561 }
1562}
1563
1564RUBY_FUNC_EXPORTED size_t
1565rb_str_memsize(VALUE str)
1566{
1567 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1568 return STR_HEAP_SIZE(str);
1569 }
1570 else {
1571 return 0;
1572 }
1573}
1574
1575VALUE
1577{
1578 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1579}
1580
1581static inline void str_discard(VALUE str);
1582static void str_shared_replace(VALUE str, VALUE str2);
1583
1584void
1586{
1587 if (str != str2) str_shared_replace(str, str2);
1588}
1589
/*
 * Moves the content of str2 into str; str and str2 must be distinct.
 *
 * str's previous buffer is discarded first.  If the content fits str's
 * embedded capacity it is copied and str2 is left untouched.  Otherwise
 * str takes over str2's heap buffer (or its shared root) and str2 is reset
 * to an empty embedded string.  Encoding and coderange of str2 are
 * applied to str in both cases.
 */
1590static void
1591str_shared_replace(VALUE str, VALUE str2)
1592{
1593 rb_encoding *enc;
1594 int cr;
1595 int termlen;
1596
1597 RUBY_ASSERT(str2 != str);
1598 enc = STR_ENC_GET(str2);
1599 cr = ENC_CODERANGE(str2);
1600 str_discard(str);
1601 termlen = rb_enc_mbminlen(enc);
1602
1603 STR_SET_LEN(str, RSTRING_LEN(str2));
1604
1605 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
/* Fits in str's embedded storage: copy content including the terminator. */
1606 STR_SET_EMBED(str);
1607 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1608 rb_enc_associate(str, enc);
1609 ENC_CODERANGE_SET(str, cr);
1610 }
1611 else {
1612 if (STR_EMBED_P(str2)) {
/* str2 is embedded but too large for str: move str2's bytes to a heap
 * buffer first so the buffer can be handed over below. */
1613 assert(!FL_TEST(str2, STR_SHARED));
1614 long len = RSTRING_LEN(str2);
1615 assert(len + termlen <= str_embed_capa(str2));
1616
1617 char *new_ptr = ALLOC_N(char, len + termlen);
1618 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1619 RSTRING(str2)->as.heap.ptr = new_ptr;
1620 STR_SET_LEN(str2, len);
1621 RSTRING(str2)->as.heap.aux.capa = len;
1622 STR_SET_NOEMBED(str2);
1623 }
1624
1625 STR_SET_NOEMBED(str);
1626 FL_UNSET(str, STR_SHARED);
1627 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1628
/* aux holds either the shared root or the capacity, depending on str2. */
1629 if (FL_TEST(str2, STR_SHARED)) {
1630 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1631 STR_SET_SHARED(str, shared);
1632 }
1633 else {
1634 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1635 }
1636
1637 /* abandon str2 */
1638 STR_SET_EMBED(str2);
1639 RSTRING_PTR(str2)[0] = 0;
1640 STR_SET_LEN(str2, 0);
1641 rb_enc_associate(str, enc);
1642 ENC_CODERANGE_SET(str, cr);
1643 }
1644}
1645
1646VALUE
1647rb_obj_as_string(VALUE obj)
1648{
1649 VALUE str;
1650
1651 if (RB_TYPE_P(obj, T_STRING)) {
1652 return obj;
1653 }
1654 str = rb_funcall(obj, idTo_s, 0);
1655 return rb_obj_as_string_result(str, obj);
1656}
1657
1658VALUE
1659rb_obj_as_string_result(VALUE str, VALUE obj)
1660{
1661 if (!RB_TYPE_P(str, T_STRING))
1662 return rb_any_to_s(obj);
1663 return str;
1664}
1665
1666static VALUE
1667str_replace(VALUE str, VALUE str2)
1668{
1669 long len;
1670
1671 len = RSTRING_LEN(str2);
1672 if (STR_SHARED_P(str2)) {
1673 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1674 assert(OBJ_FROZEN(shared));
1675 STR_SET_NOEMBED(str);
1676 STR_SET_LEN(str, len);
1677 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1678 STR_SET_SHARED(str, shared);
1679 rb_enc_cr_str_exact_copy(str, str2);
1680 }
1681 else {
1682 str_replace_shared(str, str2);
1683 }
1684
1685 return str;
1686}
1687
1688static inline VALUE
1689ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1690{
1691 size_t size = rb_str_embed_size(capa);
1692 assert(size > 0);
1693 assert(rb_gc_size_allocatable_p(size));
1694
1695 NEWOBJ_OF(str, struct RString, klass,
1697
1698 return (VALUE)str;
1699}
1700
1701static inline VALUE
1702ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1703{
1704 NEWOBJ_OF(str, struct RString, klass,
1705 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1706
1707 return (VALUE)str;
1708}
1709
1710static inline VALUE
1711str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1712{
1713 const VALUE flag_mask =
1715 FL_FREEZE
1716 ;
1717 VALUE flags = FL_TEST_RAW(str, flag_mask);
1718 int encidx = 0;
1719 if (STR_EMBED_P(str)) {
1720 long len = RSTRING_LEN(str);
1721
1722 assert(STR_EMBED_P(dup));
1723 assert(str_embed_capa(dup) >= len + 1);
1724 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1725 }
1726 else {
1727 VALUE root = str;
1728 if (FL_TEST_RAW(str, STR_SHARED)) {
1729 root = RSTRING(str)->as.heap.aux.shared;
1730 }
1731 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1732 root = str = str_new_frozen(klass, str);
1733 flags = FL_TEST_RAW(str, flag_mask);
1734 }
1735 assert(!STR_SHARED_P(root));
1736 assert(RB_OBJ_FROZEN_RAW(root));
1737
1738 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1739 FL_SET(root, STR_SHARED_ROOT);
1740 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1741 flags |= RSTRING_NOEMBED | STR_SHARED;
1742 }
1743
1744 STR_SET_LEN(dup, RSTRING_LEN(str));
1745
1746 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1747 encidx = rb_enc_get_index(str);
1748 flags &= ~ENCODING_MASK;
1749 }
1750 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1751 if (encidx) rb_enc_associate_index(dup, encidx);
1752 return dup;
1753}
1754
1755static inline VALUE
1756ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1757{
1758 VALUE dup;
1759 if (STR_EMBED_P(str)) {
1760 dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1761 }
1762 else {
1763 dup = ec_str_alloc_heap(ec, klass);
1764 }
1765
1766 return str_duplicate_setup(klass, str, dup);
1767}
1768
1769static inline VALUE
1770str_duplicate(VALUE klass, VALUE str)
1771{
1772 VALUE dup;
1773 if (STR_EMBED_P(str)) {
1774 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1775 }
1776 else {
1777 dup = str_alloc_heap(klass);
1778 }
1779
1780 return str_duplicate_setup(klass, str, dup);
1781}
1782
1783VALUE
1784rb_str_dup(VALUE str)
1785{
1786 return str_duplicate(rb_obj_class(str), str);
1787}
1788
1789/* :nodoc: */
1790VALUE
1791rb_str_dup_m(VALUE str)
1792{
1793 if (LIKELY(BARE_STRING_P(str))) {
1794 return str_duplicate(rb_obj_class(str), str);
1795 }
1796 else {
1797 return rb_obj_dup(str);
1798 }
1799}
1800
1801VALUE
1803{
1804 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1805 return str_duplicate(rb_cString, str);
1806}
1807
1808VALUE
1809rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1810{
1811 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1812 return ec_str_duplicate(ec, rb_cString, str);
1813}
1814
1815/*
1816 *
1817 * call-seq:
1818 * String.new(string = '', **opts) -> new_string
1819 *
1820 * :include: doc/string/new.rdoc
1821 *
1822 */
1823
1824static VALUE
1825rb_str_init(int argc, VALUE *argv, VALUE str)
1826{
/* Accepts one optional positional argument (the source string) and the
 * keywords encoding: and capacity:.  Returns str. */
1827 static ID keyword_ids[2];
1828 VALUE orig, opt, venc, vcapa;
1829 VALUE kwargs[2];
1830 rb_encoding *enc = 0;
1831 int n;
1832
/* Keyword IDs are looked up once and cached in the static array. */
1833 if (!keyword_ids[0]) {
1834 keyword_ids[0] = rb_id_encoding();
1835 CONST_ID(keyword_ids[1], "capacity");
1836 }
1837
1838 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1839 if (!NIL_P(opt)) {
1840 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1841 venc = kwargs[0];
1842 vcapa = kwargs[1];
1843 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1844 enc = rb_to_encoding(venc);
1845 }
1846 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1847 long capa = NUM2LONG(vcapa);
1848 long len = 0;
1849 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1850
/* The capacity is clamped up to STR_BUF_MIN_SIZE, then to the source length. */
1851 if (capa < STR_BUF_MIN_SIZE) {
1852 capa = STR_BUF_MIN_SIZE;
1853 }
1854 if (n == 1) {
1855 StringValue(orig);
1856 len = RSTRING_LEN(orig);
1857 if (capa < len) {
1858 capa = len;
1859 }
/* Initializing from itself: nothing to copy later. */
1860 if (orig == str) n = 0;
1861 }
1862 str_modifiable(str);
1863 if (STR_EMBED_P(str) || FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1864 /* make noembed always */
1865 const size_t size = (size_t)capa + termlen;
1866 const char *const old_ptr = RSTRING_PTR(str);
1867 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
1868 char *new_ptr = ALLOC_N(char, size);
1869 if (STR_EMBED_P(str)) RUBY_ASSERT(osize <= str_embed_capa(str));
1870 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1871 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1872 RSTRING(str)->as.heap.ptr = new_ptr;
1873 }
1874 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
/* str owns a heap buffer of a different size: resize it in place. */
1875 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1876 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1877 }
1878 STR_SET_LEN(str, len);
1879 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1880 if (n == 1) {
1881 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1882 rb_enc_cr_str_exact_copy(str, orig);
1883 }
1884 FL_SET(str, STR_NOEMBED);
1885 RSTRING(str)->as.heap.aux.capa = capa;
1886 }
1887 else if (n == 1) {
1888 rb_str_replace(str, orig);
1889 }
1890 if (enc) {
1891 rb_enc_associate(str, enc);
/* NOTE(review): this listing skips one original line (1892) here;
 * presumably it resets the cached coderange after the encoding change --
 * confirm against the canonical source. */
1893 }
1894 }
1895 else if (n == 1) {
1896 rb_str_replace(str, orig);
1897 }
1898 return str;
1899}
1900
1901/* :nodoc: */
1902static VALUE
1903rb_str_s_new(int argc, VALUE *argv, VALUE klass)
1904{
/* Fast path for String.new on rb_cString itself when keywords are given.
 * Subclasses and keyword-less calls go through the generic
 * rb_class_new_instance_pass_kw() path. */
1905 if (klass != rb_cString) {
1906 return rb_class_new_instance_pass_kw(argc, argv, klass);
1907 }
1908
1909 static ID keyword_ids[2];
1910 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
1911 VALUE kwargs[2];
1912 rb_encoding *enc = NULL;
1913
1914 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1915 if (NIL_P(opt)) {
1916 return rb_class_new_instance_pass_kw(argc, argv, klass);
1917 }
1918
1919 keyword_ids[0] = rb_id_encoding();
1920 CONST_ID(keyword_ids[1], "capacity");
1921 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1922 encoding = kwargs[0];
1923 capacity = kwargs[1];
1924
1925 int termlen = 1;
1926
1927 if (n == 1) {
1928 orig = StringValue(orig);
1929 }
1930 else {
1931 orig = Qnil;
1932 }
1933
/* Without an encoding: keyword, inherit the encoding of the source string. */
1934 if (UNDEF_P(encoding)) {
1935 if (!NIL_P(orig)) {
1936 encoding = rb_obj_encoding(orig);
1937 }
1938 }
1939
1940 if (!UNDEF_P(encoding)) {
1941 enc = rb_to_encoding(encoding);
1942 termlen = rb_enc_mbminlen(enc);
1943 }
1944
1945 // If no capacity: keyword was given (undef), we're basically just duping `orig`.
1946 if (UNDEF_P(capacity)) {
1947 if (NIL_P(orig)) {
1948 VALUE empty_str = str_new(klass, "", 0);
1949 if (enc) {
1950 rb_enc_associate(empty_str, enc);
1951 }
1952 return empty_str;
1953 }
1954 VALUE copy = str_duplicate(klass, orig);
1955 rb_enc_associate(copy, enc);
1956 ENC_CODERANGE_CLEAR(copy);
1957 return copy;
1958 }
1959
/* Negative capacities are treated as 0; the source's capacity is a floor. */
1960 long capa = 0;
1961 capa = NUM2LONG(capacity);
1962 if (capa < 0) {
1963 capa = 0;
1964 }
1965
1966 if (!NIL_P(orig)) {
1967 long orig_capa = rb_str_capacity(orig);
1968 if (orig_capa > capa) {
1969 capa = orig_capa;
1970 }
1971 }
1972
/* Allocate by asking for a string of fake_len bytes, then reset its length
 * to 0 so that only the reserved storage remains. */
1973 long fake_len = capa - termlen;
1974 if (fake_len < 0) {
1975 fake_len = 0;
1976 }
1977
1978 VALUE str = str_new0(klass, NULL, fake_len, termlen);
1979 STR_SET_LEN(str, 0);
1980 TERM_FILL(RSTRING_PTR(str), termlen);
1981
1982 if (enc) {
1983 rb_enc_associate(str, enc);
1984 }
1985
1986 if (!NIL_P(orig)) {
1987 rb_str_buf_append(str, orig);
1988 }
1989
1990 return str;
1991}
1992
1993#ifdef NONASCII_MASK
/* True for every byte except UTF-8 continuation bytes (10xxxxxx). */
1994#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)
1995
1996/*
1997 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
1998 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
1999 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
2000 *
2001 * if (!(byte & 0x80))
2002 * byte |= 0x40; // turn on bit6
2003 * return ((byte>>6) & 1); // bit6 represent whether this byte is leading or not.
2004 *
2005 * This function calculates whether a byte is leading or not for all bytes
2006 * in the argument word by concurrently using the above logic, and then
2007 * adds up the number of leading bytes in the word.
2008 */
2009static inline uintptr_t
2010count_utf8_lead_bytes_with_word(const uintptr_t *s)
2011{
2012 uintptr_t d = *s;
2013
2014 /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
2015 d = (d>>6) | (~d>>7);
2016 d &= NONASCII_MASK >> 7;
2017
2018 /* Gather all bytes. */
2019#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
2020 /* use only if it can use POPCNT */
2021 return rb_popcount_intptr(d);
2022#else
/* Fold the per-byte 0/1 flags together by shifted additions; the sum ends
 * up in the low bits and is extracted with the 0xF mask. */
2023 d += (d>>8);
2024 d += (d>>16);
2025# if SIZEOF_VOIDP == 8
2026 d += (d>>32);
2027# endif
2028 return (d&0xF);
2029#endif
2030}
2031#endif
2032
/*
 * Counts the characters in the byte range [p, e) for encoding enc, using
 * the coderange hint cr to choose a strategy:
 *  - fixed-width encodings: byte length divided by the width, rounded up;
 *  - valid UTF-8 (when NONASCII_MASK is available): count lead bytes,
 *    a machine word at a time over the aligned middle part;
 *  - other ASCII-compatible encodings: skip ASCII runs with
 *    search_nonascii() and step over the remaining characters one by one;
 *  - anything else: step character by character with rb_enc_mbclen().
 */
2033static inline long
2034enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2035{
2036 long c;
2037 const char *q;
2038
2039 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2040 long diff = (long)(e - p);
/* A trailing partial unit counts as one more character. */
2041 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2042 }
2043#ifdef NONASCII_MASK
2044 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2045 uintptr_t len = 0;
2046 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2047 const uintptr_t *s, *t;
2048 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
/* s: p rounded up to a word boundary; t: e rounded down to one. */
2049 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2050 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2051 while (p < (const char *)s) {
2052 if (is_utf8_lead_byte(*p)) len++;
2053 p++;
2054 }
2055 while (s < t) {
2056 len += count_utf8_lead_bytes_with_word(s);
2057 s++;
2058 }
2059 p = (const char *)s;
2060 }
/* Remaining tail bytes (or the whole range when it was short). */
2061 while (p < e) {
2062 if (is_utf8_lead_byte(*p)) len++;
2063 p++;
2064 }
2065 return (long)len;
2066 }
2067#endif
2068 else if (rb_enc_asciicompat(enc)) {
2069 c = 0;
/* The two loops differ only in the length function: rb_enc_fast_mbclen()
 * when the coderange is clean, rb_enc_mbclen() otherwise. */
2070 if (ENC_CODERANGE_CLEAN_P(cr)) {
2071 while (p < e) {
2072 if (ISASCII(*p)) {
2073 q = search_nonascii(p, e);
2074 if (!q)
2075 return c + (e - p);
2076 c += q - p;
2077 p = q;
2078 }
2079 p += rb_enc_fast_mbclen(p, e, enc);
2080 c++;
2081 }
2082 }
2083 else {
2084 while (p < e) {
2085 if (ISASCII(*p)) {
2086 q = search_nonascii(p, e);
2087 if (!q)
2088 return c + (e - p);
2089 c += q - p;
2090 p = q;
2091 }
2092 p += rb_enc_mbclen(p, e, enc);
2093 c++;
2094 }
2095 }
2096 return c;
2097 }
2098
2099 for (c=0; p<e; c++) {
2100 p += rb_enc_mbclen(p, e, enc);
2101 }
2102 return c;
2103}
2104
2105long
2106rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2107{
2108 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2109}
2110
2111/* To get strlen with cr
2112 * Note that given cr is not used.
2113 */
2114long
2115rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2116{
2117 long c;
2118 const char *q;
2119 int ret;
2120
2121 *cr = 0;
2122 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2123 long diff = (long)(e - p);
2124 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2125 }
2126 else if (rb_enc_asciicompat(enc)) {
2127 c = 0;
2128 while (p < e) {
2129 if (ISASCII(*p)) {
2130 q = search_nonascii(p, e);
2131 if (!q) {
2132 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2133 return c + (e - p);
2134 }
2135 c += q - p;
2136 p = q;
2137 }
2138 ret = rb_enc_precise_mbclen(p, e, enc);
2139 if (MBCLEN_CHARFOUND_P(ret)) {
2140 *cr |= ENC_CODERANGE_VALID;
2141 p += MBCLEN_CHARFOUND_LEN(ret);
2142 }
2143 else {
2145 p++;
2146 }
2147 c++;
2148 }
2149 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2150 return c;
2151 }
2152
2153 for (c=0; p<e; c++) {
2154 ret = rb_enc_precise_mbclen(p, e, enc);
2155 if (MBCLEN_CHARFOUND_P(ret)) {
2156 *cr |= ENC_CODERANGE_VALID;
2157 p += MBCLEN_CHARFOUND_LEN(ret);
2158 }
2159 else {
2161 if (p + rb_enc_mbminlen(enc) <= e)
2162 p += rb_enc_mbminlen(enc);
2163 else
2164 p = e;
2165 }
2166 }
2167 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2168 return c;
2169}
2170
2171/* enc must be str's enc or rb_enc_check(str, str2) */
2172static long
2173str_strlen(VALUE str, rb_encoding *enc)
2174{
2175 const char *p, *e;
2176 int cr;
2177
2178 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2179 if (!enc) enc = STR_ENC_GET(str);
2180 p = RSTRING_PTR(str);
2181 e = RSTRING_END(str);
2182 cr = ENC_CODERANGE(str);
2183
2184 if (cr == ENC_CODERANGE_UNKNOWN) {
2185 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2186 if (cr) ENC_CODERANGE_SET(str, cr);
2187 return n;
2188 }
2189 else {
2190 return enc_strlen(p, e, enc, cr);
2191 }
2192}
2193
2194long
2196{
2197 return str_strlen(str, NULL);
2198}
2199
2200/*
2201 * call-seq:
2202 * length -> integer
2203 *
2204 * :include: doc/string/length.rdoc
2205 *
2206 */
2207
2208VALUE
2210{
2211 return LONG2NUM(str_strlen(str, NULL));
2212}
2213
2214/*
2215 * call-seq:
2216 * bytesize -> integer
2217 *
2218 * :include: doc/string/bytesize.rdoc
2219 *
2220 */
2221
2222VALUE
2223rb_str_bytesize(VALUE str)
2224{
2225 return LONG2NUM(RSTRING_LEN(str));
2226}
2227
2228/*
2229 * call-seq:
2230 * empty? -> true or false
2231 *
2232 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2233 *
2234 * "hello".empty? # => false
2235 * " ".empty? # => false
2236 * "".empty? # => true
2237 *
2238 */
2239
2240static VALUE
2241rb_str_empty(VALUE str)
2242{
2243 return RBOOL(RSTRING_LEN(str) == 0);
2244}
2245
2246/*
2247 * call-seq:
2248 * string + other_string -> new_string
2249 *
2250 * Returns a new \String containing +other_string+ concatenated to +self+:
2251 *
2252 * "Hello from " + self.to_s # => "Hello from main"
2253 *
2254 */
2255
2256VALUE
2258{
2259 VALUE str3;
2260 rb_encoding *enc;
2261 char *ptr1, *ptr2, *ptr3;
2262 long len1, len2;
2263 int termlen;
2264
2265 StringValue(str2);
2266 enc = rb_enc_check_str(str1, str2);
2267 RSTRING_GETMEM(str1, ptr1, len1);
2268 RSTRING_GETMEM(str2, ptr2, len2);
2269 termlen = rb_enc_mbminlen(enc);
2270 if (len1 > LONG_MAX - len2) {
2271 rb_raise(rb_eArgError, "string size too big");
2272 }
2273 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2274 ptr3 = RSTRING_PTR(str3);
2275 memcpy(ptr3, ptr1, len1);
2276 memcpy(ptr3+len1, ptr2, len2);
2277 TERM_FILL(&ptr3[len1+len2], termlen);
2278
2279 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2281 RB_GC_GUARD(str1);
2282 RB_GC_GUARD(str2);
2283 return str3;
2284}
2285
2286/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2287VALUE
2288rb_str_opt_plus(VALUE str1, VALUE str2)
2289{
2290 assert(RBASIC_CLASS(str1) == rb_cString);
2291 assert(RBASIC_CLASS(str2) == rb_cString);
2292 long len1, len2;
2293 MAYBE_UNUSED(char) *ptr1, *ptr2;
2294 RSTRING_GETMEM(str1, ptr1, len1);
2295 RSTRING_GETMEM(str2, ptr2, len2);
2296 int enc1 = rb_enc_get_index(str1);
2297 int enc2 = rb_enc_get_index(str2);
2298
2299 if (enc1 < 0) {
2300 return Qundef;
2301 }
2302 else if (enc2 < 0) {
2303 return Qundef;
2304 }
2305 else if (enc1 != enc2) {
2306 return Qundef;
2307 }
2308 else if (len1 > LONG_MAX - len2) {
2309 return Qundef;
2310 }
2311 else {
2312 return rb_str_plus(str1, str2);
2313 }
2314
2315}
2316
2317/*
2318 * call-seq:
2319 * string * integer -> new_string
2320 *
2321 * Returns a new \String containing +integer+ copies of +self+:
2322 *
2323 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2324 * "Ho! " * 0 # => ""
2325 *
2326 */
2327
2328VALUE
2330{
2331 VALUE str2;
2332 long n, len;
2333 char *ptr2;
2334 int termlen;
2335
2336 if (times == INT2FIX(1)) {
2337 return str_duplicate(rb_cString, str);
2338 }
2339 if (times == INT2FIX(0)) {
2340 str2 = str_alloc_embed(rb_cString, 0);
2341 rb_enc_copy(str2, str);
2342 return str2;
2343 }
2344 len = NUM2LONG(times);
2345 if (len < 0) {
2346 rb_raise(rb_eArgError, "negative argument");
2347 }
2348 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2349 if (STR_EMBEDDABLE_P(len, 1)) {
2350 str2 = str_alloc_embed(rb_cString, len + 1);
2351 memset(RSTRING_PTR(str2), 0, len + 1);
2352 }
2353 else {
2354 str2 = str_alloc_heap(rb_cString);
2355 RSTRING(str2)->as.heap.aux.capa = len;
2356 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2357 }
2358 STR_SET_LEN(str2, len);
2359 rb_enc_copy(str2, str);
2360 return str2;
2361 }
2362 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2363 rb_raise(rb_eArgError, "argument too big");
2364 }
2365
2366 len *= RSTRING_LEN(str);
2367 termlen = TERM_LEN(str);
2368 str2 = str_new0(rb_cString, 0, len, termlen);
2369 ptr2 = RSTRING_PTR(str2);
2370 if (len) {
2371 n = RSTRING_LEN(str);
2372 memcpy(ptr2, RSTRING_PTR(str), n);
2373 while (n <= len/2) {
2374 memcpy(ptr2 + n, ptr2, n);
2375 n *= 2;
2376 }
2377 memcpy(ptr2 + n, ptr2, len-n);
2378 }
2379 STR_SET_LEN(str2, len);
2380 TERM_FILL(&ptr2[len], termlen);
2381 rb_enc_cr_str_copy_for_substr(str2, str);
2382
2383 return str2;
2384}
2385
2386/*
2387 * call-seq:
2388 * string % object -> new_string
2389 *
2390 * Returns the result of formatting +object+ into the format specification +self+
2391 * (see Kernel#sprintf for formatting details):
2392 *
2393 * "%05d" % 123 # => "00123"
2394 *
2395 * If +self+ contains multiple substitutions, +object+ must be
2396 * an Array or Hash containing the values to be substituted:
2397 *
2398 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2399 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2400 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2401 *
2402 */
2403
2404static VALUE
2405rb_str_format_m(VALUE str, VALUE arg)
2406{
2407 VALUE tmp = rb_check_array_type(arg);
2408
2409 if (!NIL_P(tmp)) {
2410 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2411 }
2412 return rb_str_format(1, &arg, str);
2413}
2414
2415static inline void
2416rb_check_lockedtmp(VALUE str)
2417{
2418 if (FL_TEST(str, STR_TMPLOCK)) {
2419 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2420 }
2421}
2422
2423static inline void
2424str_modifiable(VALUE str)
2425{
2426 rb_check_lockedtmp(str);
2427 rb_check_frozen(str);
2428}
2429
2430static inline int
2431str_dependent_p(VALUE str)
2432{
2433 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2434 return 0;
2435 }
2436 else {
2437 return 1;
2438 }
2439}
2440
2441static inline int
2442str_independent(VALUE str)
2443{
2444 str_modifiable(str);
2445 return !str_dependent_p(str);
2446}
2447
/*
 * Gives str its own private storage holding the first len bytes of its
 * current content, with room for len + expand bytes plus a terminator of
 * termlen bytes.
 *
 * A non-embedded string whose new capacity fits the embedded area is
 * converted to embedded form; otherwise a fresh heap buffer is allocated.
 */
2448static void
2449str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2450{
2451 char *ptr;
2452 char *oldptr;
2453 long capa = len + expand;
2454
/* Never keep more bytes than the new capacity. */
2455 if (len > capa) len = capa;
2456
2457 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2458 ptr = RSTRING(str)->as.heap.ptr;
2459 STR_SET_EMBED(str);
2460 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2461 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2462 STR_SET_LEN(str, len);
2463 return;
2464 }
2465
2466 ptr = ALLOC_N(char, (size_t)capa + termlen);
2467 oldptr = RSTRING_PTR(str);
2468 if (oldptr) {
2469 memcpy(ptr, oldptr, len);
2470 }
/* Free the old buffer only when str owned it: non-embedded, and neither
 * NOFREE nor shared. */
2471 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2472 xfree(oldptr);
2473 }
2474 STR_SET_NOEMBED(str);
2475 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2476 TERM_FILL(ptr + len, termlen);
2477 RSTRING(str)->as.heap.ptr = ptr;
2478 STR_SET_LEN(str, len);
2479 RSTRING(str)->as.heap.aux.capa = capa;
2480}
2481
2482void
2483rb_str_modify(VALUE str)
2484{
2485 if (!str_independent(str))
2486 str_make_independent(str);
2488}
2489
2490void
2492{
2493 int termlen = TERM_LEN(str);
2494 long len = RSTRING_LEN(str);
2495
2496 if (expand < 0) {
2497 rb_raise(rb_eArgError, "negative expanding string size");
2498 }
2499 if (expand >= LONG_MAX - len) {
2500 rb_raise(rb_eArgError, "string size too big");
2501 }
2502
2503 if (!str_independent(str)) {
2504 str_make_independent_expand(str, len, expand, termlen);
2505 }
2506 else if (expand > 0) {
2507 RESIZE_CAPA_TERM(str, len + expand, termlen);
2508 }
2510}
2511
2512/* As rb_str_modify(), but don't clear coderange */
2513static void
2514str_modify_keep_cr(VALUE str)
2515{
2516 if (!str_independent(str))
2517 str_make_independent(str);
2519 /* Force re-scan later */
2521}
2522
2523static inline void
2524str_discard(VALUE str)
2525{
2526 str_modifiable(str);
2527 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2528 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2529 RSTRING(str)->as.heap.ptr = 0;
2530 STR_SET_LEN(str, 0);
2531 }
2532}
2533
2534void
2536{
2537 rb_encoding *enc = rb_enc_get(str);
2538 if (!enc) {
2539 rb_raise(rb_eTypeError, "not encoding capable object");
2540 }
2541 if (!rb_enc_asciicompat(enc)) {
2542 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2543 }
2544}
2545
2546VALUE
2548{
2549 VALUE s = *ptr;
2550 if (!RB_TYPE_P(s, T_STRING)) {
2551 s = rb_str_to_str(s);
2552 *ptr = s;
2553 }
2554 return s;
2555}
2556
2557char *
2559{
2560 VALUE str = rb_string_value(ptr);
2561 return RSTRING_PTR(str);
2562}
2563
/* Return 1 when the first n bytes at s are all zero (trivially so for
 * n <= 0), otherwise 0. */
static int
zero_filled(const char *s, int n)
{
    while (n > 0) {
        if (*s != '\0') return 0;
        s++;
        n--;
    }
    return 1;
}
2572
2573static const char *
2574str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2575{
2576 const char *e = s + len;
2577
2578 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2579 if (zero_filled(s, minlen)) return s;
2580 }
2581 return 0;
2582}
2583
2584static char *
2585str_fill_term(VALUE str, char *s, long len, int termlen)
2586{
2587 /* This function assumes that (capa + termlen) bytes of memory
2588 * is allocated, like many other functions in this file.
2589 */
2590 if (str_dependent_p(str)) {
2591 if (!zero_filled(s + len, termlen))
2592 str_make_independent_expand(str, len, 0L, termlen);
2593 }
2594 else {
2595 TERM_FILL(s + len, termlen);
2596 return s;
2597 }
2598 return RSTRING_PTR(str);
2599}
2600
2601void
2602rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2603{
2604 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2605 long len = RSTRING_LEN(str);
2606
2607 assert(capa >= len);
2608 if (capa - len < termlen) {
2609 rb_check_lockedtmp(str);
2610 str_make_independent_expand(str, len, 0L, termlen);
2611 }
2612 else if (str_dependent_p(str)) {
2613 if (termlen > oldtermlen)
2614 str_make_independent_expand(str, len, 0L, termlen);
2615 }
2616 else {
2617 if (!STR_EMBED_P(str)) {
2618 /* modify capa instead of realloc */
2619 assert(!FL_TEST((str), STR_SHARED));
2620 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2621 }
2622 if (termlen > oldtermlen) {
2623 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2624 }
2625 }
2626
2627 return;
2628}
2629
2630static char *
2631str_null_check(VALUE str, int *w)
2632{
2633 char *s = RSTRING_PTR(str);
2634 long len = RSTRING_LEN(str);
2635 rb_encoding *enc = rb_enc_get(str);
2636 const int minlen = rb_enc_mbminlen(enc);
2637
2638 if (minlen > 1) {
2639 *w = 1;
2640 if (str_null_char(s, len, minlen, enc)) {
2641 return NULL;
2642 }
2643 return str_fill_term(str, s, len, minlen);
2644 }
2645 *w = 0;
2646 if (!s || memchr(s, 0, len)) {
2647 return NULL;
2648 }
2649 if (s[len]) {
2650 s = str_fill_term(str, s, len, minlen);
2651 }
2652 return s;
2653}
2654
2655char *
2656rb_str_to_cstr(VALUE str)
2657{
2658 int w;
2659 return str_null_check(str, &w);
2660}
2661
2662char *
2664{
2665 VALUE str = rb_string_value(ptr);
2666 int w;
2667 char *s = str_null_check(str, &w);
2668 if (!s) {
2669 if (w) {
2670 rb_raise(rb_eArgError, "string contains null char");
2671 }
2672 rb_raise(rb_eArgError, "string contains null byte");
2673 }
2674 return s;
2675}
2676
2677char *
2678rb_str_fill_terminator(VALUE str, const int newminlen)
2679{
2680 char *s = RSTRING_PTR(str);
2681 long len = RSTRING_LEN(str);
2682 return str_fill_term(str, s, len, newminlen);
2683}
2684
2685VALUE
2687{
2688 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2689 return str;
2690}
2691
2692/*
2693 * call-seq:
2694 * String.try_convert(object) -> object, new_string, or nil
2695 *
2696 * If +object+ is a \String object, returns +object+.
2697 *
2698 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2699 * calls <tt>object.to_str</tt> and returns the result.
2700 *
2701 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2702 *
2703 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2704 */
2705static VALUE
2706rb_str_s_try_convert(VALUE dummy, VALUE str)
2707{
2708 return rb_check_string_type(str);
2709}
2710
2711static char*
2712str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2713{
2714 long nth = *nthp;
2715 if (rb_enc_mbmaxlen(enc) == 1) {
2716 p += nth;
2717 }
2718 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2719 p += nth * rb_enc_mbmaxlen(enc);
2720 }
2721 else if (rb_enc_asciicompat(enc)) {
2722 const char *p2, *e2;
2723 int n;
2724
2725 while (p < e && 0 < nth) {
2726 e2 = p + nth;
2727 if (e < e2) {
2728 *nthp = nth;
2729 return (char *)e;
2730 }
2731 if (ISASCII(*p)) {
2732 p2 = search_nonascii(p, e2);
2733 if (!p2) {
2734 nth -= e2 - p;
2735 *nthp = nth;
2736 return (char *)e2;
2737 }
2738 nth -= p2 - p;
2739 p = p2;
2740 }
2741 n = rb_enc_mbclen(p, e, enc);
2742 p += n;
2743 nth--;
2744 }
2745 *nthp = nth;
2746 if (nth != 0) {
2747 return (char *)e;
2748 }
2749 return (char *)p;
2750 }
2751 else {
2752 while (p < e && nth--) {
2753 p += rb_enc_mbclen(p, e, enc);
2754 }
2755 }
2756 if (p > e) p = e;
2757 *nthp = nth;
2758 return (char*)p;
2759}
2760
2761char*
2762rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2763{
2764 return str_nth_len(p, e, &nth, enc);
2765}
2766
2767static char*
2768str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2769{
2770 if (singlebyte)
2771 p += nth;
2772 else {
2773 p = str_nth_len(p, e, &nth, enc);
2774 }
2775 if (!p) return 0;
2776 if (p > e) p = e;
2777 return (char *)p;
2778}
2779
2780/* char offset to byte offset */
2781static long
2782str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2783{
2784 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2785 if (!pp) return e - p;
2786 return pp - p;
2787}
2788
2789long
2790rb_str_offset(VALUE str, long pos)
2791{
2792 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2793 STR_ENC_GET(str), single_byte_optimizable(str));
2794}
2795
2796#ifdef NONASCII_MASK
2797static char *
2798str_utf8_nth(const char *p, const char *e, long *nthp)
2799{
2800 long nth = *nthp;
2801 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2802 const uintptr_t *s, *t;
2803 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2804 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2805 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2806 while (p < (const char *)s) {
2807 if (is_utf8_lead_byte(*p)) nth--;
2808 p++;
2809 }
2810 do {
2811 nth -= count_utf8_lead_bytes_with_word(s);
2812 s++;
2813 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2814 p = (char *)s;
2815 }
2816 while (p < e) {
2817 if (is_utf8_lead_byte(*p)) {
2818 if (nth == 0) break;
2819 nth--;
2820 }
2821 p++;
2822 }
2823 *nthp = nth;
2824 return (char *)p;
2825}
2826
/* Byte offset from p of the position str_utf8_nth() finds for nth. */
static long
str_utf8_offset(const char *p, const char *e, long nth)
{
    long remaining = nth;
    const char *const pos = str_utf8_nth(p, e, &remaining);

    return pos - p;
}
2833#endif
2834
2835/* byte offset to char offset */
2836long
2837rb_str_sublen(VALUE str, long pos)
2838{
2839 if (single_byte_optimizable(str) || pos < 0)
2840 return pos;
2841 else {
2842 char *p = RSTRING_PTR(str);
2843 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2844 }
2845}
2846
2847static VALUE
2848str_subseq(VALUE str, long beg, long len)
2849{
2850 VALUE str2;
2851
2852 assert(beg >= 0);
2853 assert(len >= 0);
2854 assert(beg+len <= RSTRING_LEN(str));
2855
2856 const int termlen = TERM_LEN(str);
2857 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2858 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2859 RB_GC_GUARD(str);
2860 return str2;
2861 }
2862
2863 str2 = str_alloc_heap(rb_cString);
2864 if (str_embed_capa(str2) >= len + termlen) {
2865 char *ptr2 = RSTRING(str2)->as.embed.ary;
2866 STR_SET_EMBED(str2);
2867 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
2868 TERM_FILL(ptr2+len, termlen);
2869
2870 STR_SET_LEN(str2, len);
2871 RB_GC_GUARD(str);
2872 }
2873 else {
2874 str_replace_shared(str2, str);
2875 assert(!STR_EMBED_P(str2));
2876 ENC_CODERANGE_CLEAR(str2);
2877 RSTRING(str2)->as.heap.ptr += beg;
2878 if (RSTRING_LEN(str2) > len) {
2879 STR_SET_LEN(str2, len);
2880 }
2881 }
2882
2883 return str2;
2884}
2885
2886VALUE
2887rb_str_subseq(VALUE str, long beg, long len)
2888{
2889 VALUE str2 = str_subseq(str, beg, len);
2890 rb_enc_cr_str_copy_for_substr(str2, str);
2891 return str2;
2892}
2893
2894char *
2895rb_str_subpos(VALUE str, long beg, long *lenp)
2896{
2897 long len = *lenp;
2898 long slen = -1L;
2899 long blen = RSTRING_LEN(str);
2900 rb_encoding *enc = STR_ENC_GET(str);
2901 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2902
2903 if (len < 0) return 0;
2904 if (!blen) {
2905 len = 0;
2906 }
2907 if (single_byte_optimizable(str)) {
2908 if (beg > blen) return 0;
2909 if (beg < 0) {
2910 beg += blen;
2911 if (beg < 0) return 0;
2912 }
2913 if (len > blen - beg)
2914 len = blen - beg;
2915 if (len < 0) return 0;
2916 p = s + beg;
2917 goto end;
2918 }
2919 if (beg < 0) {
2920 if (len > -beg) len = -beg;
2921 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2922 beg = -beg;
2923 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2924 p = e;
2925 if (!p) return 0;
2926 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2927 if (!p) return 0;
2928 len = e - p;
2929 goto end;
2930 }
2931 else {
2932 slen = str_strlen(str, enc);
2933 beg += slen;
2934 if (beg < 0) return 0;
2935 p = s + beg;
2936 if (len == 0) goto end;
2937 }
2938 }
2939 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2940 return 0;
2941 }
2942 if (len == 0) {
2943 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2944 p = s + beg;
2945 }
2946#ifdef NONASCII_MASK
2947 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2948 enc == rb_utf8_encoding()) {
2949 p = str_utf8_nth(s, e, &beg);
2950 if (beg > 0) return 0;
2951 len = str_utf8_offset(p, e, len);
2952 }
2953#endif
2954 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2955 int char_sz = rb_enc_mbmaxlen(enc);
2956
2957 p = s + beg * char_sz;
2958 if (p > e) {
2959 return 0;
2960 }
2961 else if (len * char_sz > e - p)
2962 len = e - p;
2963 else
2964 len *= char_sz;
2965 }
2966 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2967 if (beg > 0) return 0;
2968 len = 0;
2969 }
2970 else {
2971 len = str_offset(p, e, len, enc, 0);
2972 }
2973 end:
2974 *lenp = len;
2975 RB_GC_GUARD(str);
2976 return p;
2977}
2978
2979static VALUE str_substr(VALUE str, long beg, long len, int empty);
2980
2981VALUE
2982rb_str_substr(VALUE str, long beg, long len)
2983{
2984 return str_substr(str, beg, len, TRUE);
2985}
2986
2987static VALUE
2988str_substr(VALUE str, long beg, long len, int empty)
2989{
2990 char *p = rb_str_subpos(str, beg, &len);
2991
2992 if (!p) return Qnil;
2993 if (!len && !empty) return Qnil;
2994
2995 beg = p - RSTRING_PTR(str);
2996
2997 VALUE str2 = str_subseq(str, beg, len);
2998 rb_enc_cr_str_copy_for_substr(str2, str);
2999 return str2;
3000}
3001
3002/* :nodoc: */
3003VALUE
3005{
3006 if (OBJ_FROZEN(str)) return str;
3007 rb_str_resize(str, RSTRING_LEN(str));
3008 return rb_obj_freeze(str);
3009}
3010
3011
3012/*
3013 * call-seq:
3014 * +string -> new_string or self
3015 *
3016 * Returns +self+ if +self+ is not frozen.
3017 *
3018 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
3019 */
3020static VALUE
3021str_uplus(VALUE str)
3022{
3023 if (OBJ_FROZEN(str)) {
3024 return rb_str_dup(str);
3025 }
3026 else {
3027 return str;
3028 }
3029}
3030
3031/*
3032 * call-seq:
3033 * -string -> frozen_string
3034 * dedup -> frozen_string
3035 *
3036 * Returns a frozen, possibly pre-existing copy of the string.
3037 *
3038 * The returned \String will be deduplicated as long as it does not have
3039 * any instance variables set on it and is not a String subclass.
3040 *
3041 * Note that <tt>-string</tt> variant is more convenient for defining
3042 * constants:
3043 *
3044 * FILENAME = -'config/database.yml'
3045 *
 *  while +dedup+ is better suited for using the method in chains
 *  of calculations:
3048 *
3049 * @url_list.concat(urls.map(&:dedup))
3050 *
3051 */
3052static VALUE
3053str_uminus(VALUE str)
3054{
3055 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3056 str = rb_str_dup(str);
3057 }
3058 return rb_fstring(str);
3059}
3060
3061RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3062#define rb_str_dup_frozen rb_str_new_frozen
3063
3064VALUE
3066{
3067 if (FL_TEST(str, STR_TMPLOCK)) {
3068 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3069 }
3070 FL_SET(str, STR_TMPLOCK);
3071 return str;
3072}
3073
3074VALUE
3076{
3077 if (!FL_TEST(str, STR_TMPLOCK)) {
3078 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3079 }
3080 FL_UNSET(str, STR_TMPLOCK);
3081 return str;
3082}
3083
3084RUBY_FUNC_EXPORTED VALUE
3085rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3086{
3087 rb_str_locktmp(str);
3088 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3089}
3090
3091void
3092rb_str_set_len(VALUE str, long len)
3093{
3094 long capa;
3095 const int termlen = TERM_LEN(str);
3096
3097 str_modifiable(str);
3098 if (STR_SHARED_P(str)) {
3099 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3100 }
3101 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3102 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3103 }
3104
3105 int cr = ENC_CODERANGE(str);
3106 if (cr == ENC_CODERANGE_UNKNOWN) {
3107 /* Leave unknown. */
3108 }
3109 else if (len > RSTRING_LEN(str)) {
3110 if (ENC_CODERANGE_CLEAN_P(cr)) {
3111 /* Update the coderange regarding the extended part. */
3112 const char *const prev_end = RSTRING_END(str);
3113 const char *const new_end = RSTRING_PTR(str) + len;
3114 rb_encoding *enc = rb_enc_get(str);
3115 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3116 ENC_CODERANGE_SET(str, cr);
3117 }
3118 else if (cr == ENC_CODERANGE_BROKEN) {
3119 /* May be valid now, by appended part. */
3121 }
3122 }
3123 else if (len < RSTRING_LEN(str)) {
3124 if (cr != ENC_CODERANGE_7BIT) {
3125 /* ASCII-only string is keeping after truncated. Valid
3126 * and broken may be invalid or valid, leave unknown. */
3128 }
3129 }
3130
3131 STR_SET_LEN(str, len);
3132 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3133}
3134
3135VALUE
3136rb_str_resize(VALUE str, long len)
3137{
3138 if (len < 0) {
3139 rb_raise(rb_eArgError, "negative string size (or size too big)");
3140 }
3141
3142 int independent = str_independent(str);
3143 long slen = RSTRING_LEN(str);
3144
3145 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3147 }
3148
3149 {
3150 long capa;
3151 const int termlen = TERM_LEN(str);
3152 if (STR_EMBED_P(str)) {
3153 if (len == slen) return str;
3154 if (str_embed_capa(str) >= len + termlen) {
3155 STR_SET_LEN(str, len);
3156 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3157 return str;
3158 }
3159 str_make_independent_expand(str, slen, len - slen, termlen);
3160 }
3161 else if (str_embed_capa(str) >= len + termlen) {
3162 char *ptr = STR_HEAP_PTR(str);
3163 STR_SET_EMBED(str);
3164 if (slen > len) slen = len;
3165 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3166 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3167 STR_SET_LEN(str, len);
3168 if (independent) ruby_xfree(ptr);
3169 return str;
3170 }
3171 else if (!independent) {
3172 if (len == slen) return str;
3173 str_make_independent_expand(str, slen, len - slen, termlen);
3174 }
3175 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3176 (capa - len) > (len < 1024 ? len : 1024)) {
3177 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3178 (size_t)len + termlen, STR_HEAP_SIZE(str));
3179 RSTRING(str)->as.heap.aux.capa = len;
3180 }
3181 else if (len == slen) return str;
3182 STR_SET_LEN(str, len);
3183 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3184 }
3185 return str;
3186}
3187
3188static VALUE
3189str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3190{
3191 if (keep_cr) {
3192 str_modify_keep_cr(str);
3193 }
3194 else {
3195 rb_str_modify(str);
3196 }
3197 if (len == 0) return 0;
3198
3199 long total, olen, off = -1;
3200 char *sptr;
3201 const int termlen = TERM_LEN(str);
3202
3203 RSTRING_GETMEM(str, sptr, olen);
3204 if (ptr >= sptr && ptr <= sptr + olen) {
3205 off = ptr - sptr;
3206 }
3207
3208 long capa = str_capacity(str, termlen);
3209
3210 if (olen > LONG_MAX - len) {
3211 rb_raise(rb_eArgError, "string sizes too big");
3212 }
3213 total = olen + len;
3214 if (capa < total) {
3215 if (total >= LONG_MAX / 2) {
3216 capa = total;
3217 }
3218 while (total > capa) {
3219 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3220 }
3221 RESIZE_CAPA_TERM(str, capa, termlen);
3222 sptr = RSTRING_PTR(str);
3223 }
3224 if (off != -1) {
3225 ptr = sptr + off;
3226 }
3227 memcpy(sptr + olen, ptr, len);
3228 STR_SET_LEN(str, total);
3229 TERM_FILL(sptr + total, termlen); /* sentinel */
3230
3231 return str;
3232}
3233
/* Shorthands for str_buf_cat4() with keep_cr == false; str_buf_cat2()
 * takes its length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) str_buf_cat4((str), (ptr), (len), false)
#define str_buf_cat2(str, ptr) str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3236
3237VALUE
3238rb_str_cat(VALUE str, const char *ptr, long len)
3239{
3240 if (len == 0) return str;
3241 if (len < 0) {
3242 rb_raise(rb_eArgError, "negative string size (or size too big)");
3243 }
3244 return str_buf_cat(str, ptr, len);
3245}
3246
3247VALUE
3248rb_str_cat_cstr(VALUE str, const char *ptr)
3249{
3250 must_not_null(ptr);
3251 return rb_str_buf_cat(str, ptr, strlen(ptr));
3252}
3253
3254RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3255RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3256RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3257
3258static VALUE
3259rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3260 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3261{
3262 int str_encindex = ENCODING_GET(str);
3263 int res_encindex;
3264 int str_cr, res_cr;
3265 rb_encoding *str_enc, *ptr_enc;
3266
3267 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3268
3269 if (str_encindex == ptr_encindex) {
3270 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3271 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3272 }
3273 }
3274 else {
3275 str_enc = rb_enc_from_index(str_encindex);
3276 ptr_enc = rb_enc_from_index(ptr_encindex);
3277 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3278 if (len == 0)
3279 return str;
3280 if (RSTRING_LEN(str) == 0) {
3281 rb_str_buf_cat(str, ptr, len);
3282 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3283 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3284 return str;
3285 }
3286 goto incompatible;
3287 }
3288 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3289 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3290 }
3291 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3292 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3293 str_cr = rb_enc_str_coderange(str);
3294 }
3295 }
3296 }
3297 if (ptr_cr_ret)
3298 *ptr_cr_ret = ptr_cr;
3299
3300 if (str_encindex != ptr_encindex &&
3301 str_cr != ENC_CODERANGE_7BIT &&
3302 ptr_cr != ENC_CODERANGE_7BIT) {
3303 str_enc = rb_enc_from_index(str_encindex);
3304 ptr_enc = rb_enc_from_index(ptr_encindex);
3305 goto incompatible;
3306 }
3307
3308 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3309 res_encindex = str_encindex;
3310 res_cr = ENC_CODERANGE_UNKNOWN;
3311 }
3312 else if (str_cr == ENC_CODERANGE_7BIT) {
3313 if (ptr_cr == ENC_CODERANGE_7BIT) {
3314 res_encindex = str_encindex;
3315 res_cr = ENC_CODERANGE_7BIT;
3316 }
3317 else {
3318 res_encindex = ptr_encindex;
3319 res_cr = ptr_cr;
3320 }
3321 }
3322 else if (str_cr == ENC_CODERANGE_VALID) {
3323 res_encindex = str_encindex;
3324 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3325 res_cr = str_cr;
3326 else
3327 res_cr = ptr_cr;
3328 }
3329 else { /* str_cr == ENC_CODERANGE_BROKEN */
3330 res_encindex = str_encindex;
3331 res_cr = str_cr;
3332 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3333 }
3334
3335 if (len < 0) {
3336 rb_raise(rb_eArgError, "negative string size (or size too big)");
3337 }
3338 str_buf_cat(str, ptr, len);
3339 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3340 return str;
3341
3342 incompatible:
3343 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3344 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3346}
3347
3348VALUE
3349rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3350{
3351 return rb_enc_cr_str_buf_cat(str, ptr, len,
3352 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3353}
3354
3355VALUE
3357{
3358 /* ptr must reference NUL terminated ASCII string. */
3359 int encindex = ENCODING_GET(str);
3360 rb_encoding *enc = rb_enc_from_index(encindex);
3361 if (rb_enc_asciicompat(enc)) {
3362 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3363 encindex, ENC_CODERANGE_7BIT, 0);
3364 }
3365 else {
3366 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3367 while (*ptr) {
3368 unsigned int c = (unsigned char)*ptr;
3369 int len = rb_enc_codelen(c, enc);
3370 rb_enc_mbcput(c, buf, enc);
3371 rb_enc_cr_str_buf_cat(str, buf, len,
3372 encindex, ENC_CODERANGE_VALID, 0);
3373 ptr++;
3374 }
3375 return str;
3376 }
3377}
3378
3379VALUE
3380rb_str_buf_append(VALUE str, VALUE str2)
3381{
3382 int str2_cr = rb_enc_str_coderange(str2);
3383
3384 if (str_enc_fastpath(str)) {
3385 switch (str2_cr) {
3386 case ENC_CODERANGE_7BIT:
3387 // If RHS is 7bit we can do simple concatenation
3388 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3389 RB_GC_GUARD(str2);
3390 return str;
3392 // If RHS is valid, we can do simple concatenation if encodings are the same
3393 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3394 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3395 int str_cr = ENC_CODERANGE(str);
3396 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3397 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3398 }
3399 RB_GC_GUARD(str2);
3400 return str;
3401 }
3402 }
3403 }
3404
3405 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3406 ENCODING_GET(str2), str2_cr, &str2_cr);
3407
3408 ENC_CODERANGE_SET(str2, str2_cr);
3409
3410 return str;
3411}
3412
3413VALUE
3415{
3416 StringValue(str2);
3417 return rb_str_buf_append(str, str2);
3418}
3419
3420VALUE
3421rb_str_concat_literals(size_t num, const VALUE *strary)
3422{
3423 VALUE str;
3424 size_t i, s = 0;
3425 unsigned long len = 1;
3426
3427 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3428 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3429
3430 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3431 str = rb_str_buf_new(len);
3432 str_enc_copy_direct(str, strary[0]);
3433
3434 for (i = s; i < num; ++i) {
3435 const VALUE v = strary[i];
3436 int encidx = ENCODING_GET(v);
3437
3438 rb_str_buf_append(str, v);
3439 if (encidx != ENCINDEX_US_ASCII) {
3440 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3441 rb_enc_set_index(str, encidx);
3442 }
3443 }
3444 return str;
3445}
3446
3447/*
3448 * call-seq:
3449 * concat(*objects) -> string
3450 *
3451 * Concatenates each object in +objects+ to +self+ and returns +self+:
3452 *
3453 * s = 'foo'
3454 * s.concat('bar', 'baz') # => "foobarbaz"
3455 * s # => "foobarbaz"
3456 *
3457 * For each given object +object+ that is an Integer,
3458 * the value is considered a codepoint and converted to a character before concatenation:
3459 *
3460 * s = 'foo'
3461 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3462 *
3463 * Related: String#<<, which takes a single argument.
3464 */
3465static VALUE
3466rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3467{
3468 str_modifiable(str);
3469
3470 if (argc == 1) {
3471 return rb_str_concat(str, argv[0]);
3472 }
3473 else if (argc > 1) {
3474 int i;
3475 VALUE arg_str = rb_str_tmp_new(0);
3476 rb_enc_copy(arg_str, str);
3477 for (i = 0; i < argc; i++) {
3478 rb_str_concat(arg_str, argv[i]);
3479 }
3480 rb_str_buf_append(str, arg_str);
3481 }
3482
3483 return str;
3484}
3485
3486/*
3487 * call-seq:
3488 * string << object -> string
3489 *
3490 * Concatenates +object+ to +self+ and returns +self+:
3491 *
3492 * s = 'foo'
3493 * s << 'bar' # => "foobar"
3494 * s # => "foobar"
3495 *
3496 * If +object+ is an Integer,
3497 * the value is considered a codepoint and converted to a character before concatenation:
3498 *
3499 * s = 'foo'
3500 * s << 33 # => "foo!"
3501 *
3502 * Related: String#concat, which takes multiple arguments.
3503 */
3504VALUE
3506{
3507 unsigned int code;
3508 rb_encoding *enc = STR_ENC_GET(str1);
3509 int encidx;
3510
3511 if (RB_INTEGER_TYPE_P(str2)) {
3512 if (rb_num_to_uint(str2, &code) == 0) {
3513 }
3514 else if (FIXNUM_P(str2)) {
3515 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3516 }
3517 else {
3518 rb_raise(rb_eRangeError, "bignum out of char range");
3519 }
3520 }
3521 else {
3522 return rb_str_append(str1, str2);
3523 }
3524
3525 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3526 if (encidx >= 0) {
3527 char buf[1];
3528 buf[0] = (char)code;
3529 rb_str_cat(str1, buf, 1);
3530 if (encidx != rb_enc_to_index(enc)) {
3531 rb_enc_associate_index(str1, encidx);
3533 }
3534 }
3535 else {
3536 long pos = RSTRING_LEN(str1);
3537 int cr = ENC_CODERANGE(str1);
3538 int len;
3539 char *buf;
3540
3541 switch (len = rb_enc_codelen(code, enc)) {
3542 case ONIGERR_INVALID_CODE_POINT_VALUE:
3543 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3544 break;
3545 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3546 case 0:
3547 rb_raise(rb_eRangeError, "%u out of char range", code);
3548 break;
3549 }
3550 buf = ALLOCA_N(char, len + 1);
3551 rb_enc_mbcput(code, buf, enc);
3552 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3553 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3554 }
3555 rb_str_resize(str1, pos+len);
3556 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3557 if (cr == ENC_CODERANGE_7BIT && code > 127) {
3559 }
3560 else if (cr == ENC_CODERANGE_BROKEN) {
3562 }
3563 ENC_CODERANGE_SET(str1, cr);
3564 }
3565 return str1;
3566}
3567
3568int
3569rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3570{
3571 int encidx = rb_enc_to_index(enc);
3572
3573 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3574 /* US-ASCII automatically extended to ASCII-8BIT */
3575 if (code > 0xFF) {
3576 rb_raise(rb_eRangeError, "%u out of char range", code);
3577 }
3578 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3579 return ENCINDEX_ASCII_8BIT;
3580 }
3581 return encidx;
3582 }
3583 else {
3584 return -1;
3585 }
3586}
3587
3588/*
3589 * call-seq:
3590 * prepend(*other_strings) -> string
3591 *
3592 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3593 *
3594 * s = 'foo'
3595 * s.prepend('bar', 'baz') # => "barbazfoo"
3596 * s # => "barbazfoo"
3597 *
3598 * Related: String#concat.
3599 */
3600
3601static VALUE
3602rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3603{
3604 str_modifiable(str);
3605
3606 if (argc == 1) {
3607 rb_str_update(str, 0L, 0L, argv[0]);
3608 }
3609 else if (argc > 1) {
3610 int i;
3611 VALUE arg_str = rb_str_tmp_new(0);
3612 rb_enc_copy(arg_str, str);
3613 for (i = 0; i < argc; i++) {
3614 rb_str_append(arg_str, argv[i]);
3615 }
3616 rb_str_update(str, 0L, 0L, arg_str);
3617 }
3618
3619 return str;
3620}
3621
3622st_index_t
3624{
3625 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3626 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
3627 if (e && !is_ascii_string(str)) {
3628 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
3629 }
3630 return h;
3631}
3632
3633int
3635{
3636 long len1, len2;
3637 const char *ptr1, *ptr2;
3638 RSTRING_GETMEM(str1, ptr1, len1);
3639 RSTRING_GETMEM(str2, ptr2, len2);
3640 return (len1 != len2 ||
3641 !rb_str_comparable(str1, str2) ||
3642 memcmp(ptr1, ptr2, len1) != 0);
3643}
3644
3645/*
3646 * call-seq:
3647 * hash -> integer
3648 *
3649 * Returns the integer hash value for +self+.
3650 * The value is based on the length, content and encoding of +self+.
3651 *
3652 * Related: Object#hash.
3653 */
3654
3655static VALUE
3656rb_str_hash_m(VALUE str)
3657{
3658 st_index_t hval = rb_str_hash(str);
3659 return ST2FIX(hval);
3660}
3661
/* Smaller of a and b (b when they compare equal).  Each argument may be
 * evaluated twice, so pass expressions without side effects. */
#define lesser(a,b) ((a) > (b) ? (b) : (a))
3663
3664int
3666{
3667 int idx1, idx2;
3668 int rc1, rc2;
3669
3670 if (RSTRING_LEN(str1) == 0) return TRUE;
3671 if (RSTRING_LEN(str2) == 0) return TRUE;
3672 idx1 = ENCODING_GET(str1);
3673 idx2 = ENCODING_GET(str2);
3674 if (idx1 == idx2) return TRUE;
3675 rc1 = rb_enc_str_coderange(str1);
3676 rc2 = rb_enc_str_coderange(str2);
3677 if (rc1 == ENC_CODERANGE_7BIT) {
3678 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3679 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3680 return TRUE;
3681 }
3682 if (rc2 == ENC_CODERANGE_7BIT) {
3683 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3684 return TRUE;
3685 }
3686 return FALSE;
3687}
3688
3689int
3691{
3692 long len1, len2;
3693 const char *ptr1, *ptr2;
3694 int retval;
3695
3696 if (str1 == str2) return 0;
3697 RSTRING_GETMEM(str1, ptr1, len1);
3698 RSTRING_GETMEM(str2, ptr2, len2);
3699 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3700 if (len1 == len2) {
3701 if (!rb_str_comparable(str1, str2)) {
3702 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3703 return 1;
3704 return -1;
3705 }
3706 return 0;
3707 }
3708 if (len1 > len2) return 1;
3709 return -1;
3710 }
3711 if (retval > 0) return 1;
3712 return -1;
3713}
3714
3715/*
3716 * call-seq:
3717 * string == object -> true or false
3718 * string === object -> true or false
3719 *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3722 *
3723 * s = 'foo'
3724 * s == 'foo' # => true
3725 * s == 'food' # => false
3726 * s == 'FOO' # => false
3727 *
3728 * Returns +false+ if the two strings' encodings are not compatible:
3729 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3730 *
3731 * If +object+ is not an instance of \String but responds to +to_str+, then the
3732 * two strings are compared using <code>object.==</code>.
3733 */
3734
3735VALUE
3737{
3738 if (str1 == str2) return Qtrue;
3739 if (!RB_TYPE_P(str2, T_STRING)) {
3740 if (!rb_respond_to(str2, idTo_str)) {
3741 return Qfalse;
3742 }
3743 return rb_equal(str2, str1);
3744 }
3745 return rb_str_eql_internal(str1, str2);
3746}
3747
3748/*
3749 * call-seq:
3750 * eql?(object) -> true or false
3751 *
 *  Returns +true+ if +object+ has the same length and content
 *  as +self+; +false+ otherwise:
3754 *
3755 * s = 'foo'
3756 * s.eql?('foo') # => true
3757 * s.eql?('food') # => false
3758 * s.eql?('FOO') # => false
3759 *
3760 * Returns +false+ if the two strings' encodings are not compatible:
3761 *
3762 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3763 *
3764 */
3765
3766VALUE
3767rb_str_eql(VALUE str1, VALUE str2)
3768{
3769 if (str1 == str2) return Qtrue;
3770 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3771 return rb_str_eql_internal(str1, str2);
3772}
3773
3774/*
3775 * call-seq:
3776 * string <=> other_string -> -1, 0, 1, or nil
3777 *
3778 * Compares +self+ and +other_string+, returning:
3779 *
3780 * - -1 if +other_string+ is larger.
3781 * - 0 if the two are equal.
3782 * - 1 if +other_string+ is smaller.
3783 * - +nil+ if the two are incomparable.
3784 *
3785 * Examples:
3786 *
3787 * 'foo' <=> 'foo' # => 0
3788 * 'foo' <=> 'food' # => -1
3789 * 'food' <=> 'foo' # => 1
3790 * 'FOO' <=> 'foo' # => -1
3791 * 'foo' <=> 'FOO' # => 1
3792 * 'foo' <=> 1 # => nil
3793 *
3794 */
3795
3796static VALUE
3797rb_str_cmp_m(VALUE str1, VALUE str2)
3798{
3799 int result;
3800 VALUE s = rb_check_string_type(str2);
3801 if (NIL_P(s)) {
3802 return rb_invcmp(str1, str2);
3803 }
3804 result = rb_str_cmp(str1, s);
3805 return INT2FIX(result);
3806}
3807
3808static VALUE str_casecmp(VALUE str1, VALUE str2);
3809static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3810
3811/*
3812 * call-seq:
3813 * casecmp(other_string) -> -1, 0, 1, or nil
3814 *
3815 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3816 *
3817 * - -1 if <tt>other_string.downcase</tt> is larger.
3818 * - 0 if the two are equal.
3819 * - 1 if <tt>other_string.downcase</tt> is smaller.
3820 * - +nil+ if the two are incomparable.
3821 *
3822 * Examples:
3823 *
3824 * 'foo'.casecmp('foo') # => 0
3825 * 'foo'.casecmp('food') # => -1
3826 * 'food'.casecmp('foo') # => 1
3827 * 'FOO'.casecmp('foo') # => 0
3828 * 'foo'.casecmp('FOO') # => 0
3829 * 'foo'.casecmp(1) # => nil
3830 *
3831 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3832 *
3833 * Related: String#casecmp?.
3834 *
3835 */
3836
3837static VALUE
3838rb_str_casecmp(VALUE str1, VALUE str2)
3839{
3840 VALUE s = rb_check_string_type(str2);
3841 if (NIL_P(s)) {
3842 return Qnil;
3843 }
3844 return str_casecmp(str1, s);
3845}
3846
3847static VALUE
3848str_casecmp(VALUE str1, VALUE str2)
3849{
3850 long len;
3851 rb_encoding *enc;
3852 const char *p1, *p1end, *p2, *p2end;
3853
3854 enc = rb_enc_compatible(str1, str2);
3855 if (!enc) {
3856 return Qnil;
3857 }
3858
3859 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3860 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3861 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3862 while (p1 < p1end && p2 < p2end) {
3863 if (*p1 != *p2) {
3864 unsigned int c1 = TOLOWER(*p1 & 0xff);
3865 unsigned int c2 = TOLOWER(*p2 & 0xff);
3866 if (c1 != c2)
3867 return INT2FIX(c1 < c2 ? -1 : 1);
3868 }
3869 p1++;
3870 p2++;
3871 }
3872 }
3873 else {
3874 while (p1 < p1end && p2 < p2end) {
3875 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3876 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3877
3878 if (0 <= c1 && 0 <= c2) {
3879 c1 = TOLOWER(c1);
3880 c2 = TOLOWER(c2);
3881 if (c1 != c2)
3882 return INT2FIX(c1 < c2 ? -1 : 1);
3883 }
3884 else {
3885 int r;
3886 l1 = rb_enc_mbclen(p1, p1end, enc);
3887 l2 = rb_enc_mbclen(p2, p2end, enc);
3888 len = l1 < l2 ? l1 : l2;
3889 r = memcmp(p1, p2, len);
3890 if (r != 0)
3891 return INT2FIX(r < 0 ? -1 : 1);
3892 if (l1 != l2)
3893 return INT2FIX(l1 < l2 ? -1 : 1);
3894 }
3895 p1 += l1;
3896 p2 += l2;
3897 }
3898 }
3899 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3900 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3901 return INT2FIX(-1);
3902}
3903
3904/*
3905 * call-seq:
3906 * casecmp?(other_string) -> true, false, or nil
3907 *
3908 * Returns +true+ if +self+ and +other_string+ are equal after
3909 * Unicode case folding, otherwise +false+:
3910 *
3911 * 'foo'.casecmp?('foo') # => true
3912 * 'foo'.casecmp?('food') # => false
3913 * 'food'.casecmp?('foo') # => false
3914 * 'FOO'.casecmp?('foo') # => true
3915 * 'foo'.casecmp?('FOO') # => true
3916 *
3917 * Returns +nil+ if the two values are incomparable:
3918 *
3919 * 'foo'.casecmp?(1) # => nil
3920 *
3921 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3922 *
3923 * Related: String#casecmp.
3924 *
3925 */
3926
3927static VALUE
3928rb_str_casecmp_p(VALUE str1, VALUE str2)
3929{
3930 VALUE s = rb_check_string_type(str2);
3931 if (NIL_P(s)) {
3932 return Qnil;
3933 }
3934 return str_casecmp_p(str1, s);
3935}
3936
3937static VALUE
3938str_casecmp_p(VALUE str1, VALUE str2)
3939{
3940 rb_encoding *enc;
3941 VALUE folded_str1, folded_str2;
3942 VALUE fold_opt = sym_fold;
3943
3944 enc = rb_enc_compatible(str1, str2);
3945 if (!enc) {
3946 return Qnil;
3947 }
3948
3949 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3950 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3951
3952 return rb_str_eql(folded_str1, folded_str2);
3953}
3954
3955static long
3956strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3957 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3958{
3959 const char *search_start = str_ptr;
3960 long pos, search_len = str_len - offset;
3961
3962 for (;;) {
3963 const char *t;
3964 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3965 if (pos < 0) return pos;
3966 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3967 if (t == search_start + pos) break;
3968 search_len -= t - search_start;
3969 if (search_len <= 0) return -1;
3970 offset += t - search_start;
3971 search_start = t;
3972 }
3973 return pos + offset;
3974}
3975
3976/* found index in byte */
3977#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
3978#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
3979
3980static long
3981rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3982{
3983 const char *str_ptr, *str_ptr_end, *sub_ptr;
3984 long str_len, sub_len;
3985 rb_encoding *enc;
3986
3987 enc = rb_enc_check(str, sub);
3988 if (is_broken_string(sub)) return -1;
3989
3990 str_ptr = RSTRING_PTR(str);
3991 str_ptr_end = RSTRING_END(str);
3992 str_len = RSTRING_LEN(str);
3993 sub_ptr = RSTRING_PTR(sub);
3994 sub_len = RSTRING_LEN(sub);
3995
3996 if (str_len < sub_len) return -1;
3997
3998 if (offset != 0) {
3999 long str_len_char, sub_len_char;
4000 int single_byte = single_byte_optimizable(str);
4001 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
4002 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
4003 if (offset < 0) {
4004 offset += str_len_char;
4005 if (offset < 0) return -1;
4006 }
4007 if (str_len_char - offset < sub_len_char) return -1;
4008 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
4009 str_ptr += offset;
4010 }
4011 if (sub_len == 0) return offset;
4012
4013 /* need proceed one character at a time */
4014 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
4015}
4016
4017
4018/*
4019 * call-seq:
4020 * index(substring, offset = 0) -> integer or nil
4021 * index(regexp, offset = 0) -> integer or nil
4022 *
4023 * :include: doc/string/index.rdoc
4024 *
4025 */
4026
4027static VALUE
4028rb_str_index_m(int argc, VALUE *argv, VALUE str)
4029{
4030 VALUE sub;
4031 VALUE initpos;
4032 rb_encoding *enc = STR_ENC_GET(str);
4033 long pos;
4034
4035 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4036 long slen = str_strlen(str, enc); /* str's enc */
4037 pos = NUM2LONG(initpos);
4038 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4039 if (RB_TYPE_P(sub, T_REGEXP)) {
4041 }
4042 return Qnil;
4043 }
4044 }
4045 else {
4046 pos = 0;
4047 }
4048
4049 if (RB_TYPE_P(sub, T_REGEXP)) {
4050 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4051 enc, single_byte_optimizable(str));
4052
4053 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4054 VALUE match = rb_backref_get();
4055 struct re_registers *regs = RMATCH_REGS(match);
4056 pos = rb_str_sublen(str, BEG(0));
4057 return LONG2NUM(pos);
4058 }
4059 }
4060 else {
4061 StringValue(sub);
4062 pos = rb_str_index(str, sub, pos);
4063 if (pos >= 0) {
4064 pos = rb_str_sublen(str, pos);
4065 return LONG2NUM(pos);
4066 }
4067 }
4068 return Qnil;
4069}
4070
4071/* Ensure that the given pos is a valid character boundary.
4072 * Note that in this function, "character" means a code point
4073 * (Unicode scalar value), not a grapheme cluster.
4074 */
4075static void
4076str_ensure_byte_pos(VALUE str, long pos)
4077{
4078 const char *s = RSTRING_PTR(str);
4079 const char *e = RSTRING_END(str);
4080 const char *p = s + pos;
4081 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4082 rb_raise(rb_eIndexError,
4083 "offset %ld does not land on character boundary", pos);
4084 }
4085}
4086
4087/*
4088 * call-seq:
4089 * byteindex(substring, offset = 0) -> integer or nil
4090 * byteindex(regexp, offset = 0) -> integer or nil
4091 *
4092 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4093 * or +nil+ if none found:
4094 *
4095 * 'foo'.byteindex('f') # => 0
4096 * 'foo'.byteindex('o') # => 1
4097 * 'foo'.byteindex('oo') # => 1
4098 * 'foo'.byteindex('ooo') # => nil
4099 *
4100 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4101 * or +nil+ if none found:
4102 *
4103 * 'foo'.byteindex(/f/) # => 0
4104 * 'foo'.byteindex(/o/) # => 1
4105 * 'foo'.byteindex(/oo/) # => 1
4106 * 'foo'.byteindex(/ooo/) # => nil
4107 *
4108 * Integer argument +offset+, if given, specifies the byte-based position in the
4109 * string to begin the search:
4110 *
4111 * 'foo'.byteindex('o', 1) # => 1
4112 * 'foo'.byteindex('o', 2) # => 2
4113 * 'foo'.byteindex('o', 3) # => nil
4114 *
4115 * If +offset+ is negative, counts backward from the end of +self+:
4116 *
4117 * 'foo'.byteindex('o', -1) # => 2
4118 * 'foo'.byteindex('o', -2) # => 1
4119 * 'foo'.byteindex('o', -3) # => 1
4120 * 'foo'.byteindex('o', -4) # => nil
4121 *
4122 * If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is
4123 * raised.
4124 *
4125 * Related: String#index, String#byterindex.
4126 */
4127
4128static VALUE
4129rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4130{
4131 VALUE sub;
4132 VALUE initpos;
4133 long pos;
4134
4135 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4136 long slen = RSTRING_LEN(str);
4137 pos = NUM2LONG(initpos);
4138 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4139 if (RB_TYPE_P(sub, T_REGEXP)) {
4141 }
4142 return Qnil;
4143 }
4144 }
4145 else {
4146 pos = 0;
4147 }
4148
4149 str_ensure_byte_pos(str, pos);
4150
4151 if (RB_TYPE_P(sub, T_REGEXP)) {
4152 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4153 VALUE match = rb_backref_get();
4154 struct re_registers *regs = RMATCH_REGS(match);
4155 pos = BEG(0);
4156 return LONG2NUM(pos);
4157 }
4158 }
4159 else {
4160 StringValue(sub);
4161 pos = rb_str_byteindex(str, sub, pos);
4162 if (pos >= 0) return LONG2NUM(pos);
4163 }
4164 return Qnil;
4165}
4166
4167#ifdef HAVE_MEMRCHR
4168static long
4169str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4170{
4171 char *hit, *adjusted;
4172 int c;
4173 long slen, searchlen;
4174 char *sbeg, *e, *t;
4175
4176 sbeg = RSTRING_PTR(str);
4177 slen = RSTRING_LEN(sub);
4178 if (slen == 0) return s - sbeg;
4179 e = RSTRING_END(str);
4180 t = RSTRING_PTR(sub);
4181 c = *t & 0xff;
4182 searchlen = s - sbeg + 1;
4183
4184 do {
4185 hit = memrchr(sbeg, c, searchlen);
4186 if (!hit) break;
4187 adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
4188 if (hit != adjusted) {
4189 searchlen = adjusted - sbeg;
4190 continue;
4191 }
4192 if (memcmp(hit, t, slen) == 0)
4193 return hit - sbeg;
4194 searchlen = adjusted - sbeg;
4195 } while (searchlen > 0);
4196
4197 return -1;
4198}
4199#else
4200static long
4201str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
4202{
4203 long slen;
4204 char *sbeg, *e, *t;
4205
4206 sbeg = RSTRING_PTR(str);
4207 e = RSTRING_END(str);
4208 t = RSTRING_PTR(sub);
4209 slen = RSTRING_LEN(sub);
4210
4211 while (s) {
4212 if (memcmp(s, t, slen) == 0) {
4213 return s - sbeg;
4214 }
4215 if (s <= sbeg) break;
4216 s = rb_enc_prev_char(sbeg, s, e, enc);
4217 }
4218
4219 return -1;
4220}
4221#endif
4222
4223/* found index in byte */
4224static long
4225rb_str_rindex(VALUE str, VALUE sub, long pos)
4226{
4227 long len, slen;
4228 char *sbeg, *s;
4229 rb_encoding *enc;
4230 int singlebyte;
4231
4232 enc = rb_enc_check(str, sub);
4233 if (is_broken_string(sub)) return -1;
4234 singlebyte = single_byte_optimizable(str);
4235 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4236 slen = str_strlen(sub, enc); /* rb_enc_check */
4237
4238 /* substring longer than string */
4239 if (len < slen) return -1;
4240 if (len - pos < slen) pos = len - slen;
4241 if (len == 0) return pos;
4242
4243 sbeg = RSTRING_PTR(str);
4244
4245 if (pos == 0) {
4246 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4247 return 0;
4248 else
4249 return -1;
4250 }
4251
4252 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4253 return str_rindex(str, sub, s, enc);
4254}
4255
4256/*
4257 * call-seq:
4258 * rindex(substring, offset = self.length) -> integer or nil
4259 * rindex(regexp, offset = self.length) -> integer or nil
4260 *
4261 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4262 * or +nil+ if none found:
4263 *
4264 * 'foo'.rindex('f') # => 0
4265 * 'foo'.rindex('o') # => 2
4266 * 'foo'.rindex('oo') # => 1
4267 * 'foo'.rindex('ooo') # => nil
4268 *
4269 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4270 * or +nil+ if none found:
4271 *
4272 * 'foo'.rindex(/f/) # => 0
4273 * 'foo'.rindex(/o/) # => 2
4274 * 'foo'.rindex(/oo/) # => 1
4275 * 'foo'.rindex(/ooo/) # => nil
4276 *
4277 * The _last_ match means starting at the possible last position, not
4278 * the last of longest matches.
4279 *
4280 * 'foo'.rindex(/o+/) # => 2
4281 * $~ #=> #<MatchData "o">
4282 *
4283 * To get the last longest match, combine the pattern with a negative
4284 * lookbehind.
4285 *
4286 * 'foo'.rindex(/(?<!o)o+/) # => 1
4287 * $~ #=> #<MatchData "oo">
4288 *
4289 * Or use String#index with a negative lookahead.
4290 *
4291 * 'foo'.index(/o+(?!.*o)/) # => 1
4292 * $~ #=> #<MatchData "oo">
4293 *
4294 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4295 * string to _end_ the search:
4296 *
4297 * 'foo'.rindex('o', 0) # => nil
4298 * 'foo'.rindex('o', 1) # => 1
4299 * 'foo'.rindex('o', 2) # => 2
4300 * 'foo'.rindex('o', 3) # => 2
4301 *
4302 * If +offset+ is a negative Integer, the maximum starting position in the
4303 * string to _end_ the search is the sum of the string's length and +offset+:
4304 *
4305 * 'foo'.rindex('o', -1) # => 2
4306 * 'foo'.rindex('o', -2) # => 1
4307 * 'foo'.rindex('o', -3) # => nil
4308 * 'foo'.rindex('o', -4) # => nil
4309 *
4310 * Related: String#index.
4311 */
4312
4313static VALUE
4314rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4315{
4316 VALUE sub;
4317 VALUE initpos;
4318 rb_encoding *enc = STR_ENC_GET(str);
4319 long pos, len = str_strlen(str, enc); /* str's enc */
4320
4321 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4322 pos = NUM2LONG(initpos);
4323 if (pos < 0 && (pos += len) < 0) {
4324 if (RB_TYPE_P(sub, T_REGEXP)) {
4326 }
4327 return Qnil;
4328 }
4329 if (pos > len) pos = len;
4330 }
4331 else {
4332 pos = len;
4333 }
4334
4335 if (RB_TYPE_P(sub, T_REGEXP)) {
4336 /* enc = rb_enc_check(str, sub); */
4337 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4338 enc, single_byte_optimizable(str));
4339
4340 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4341 VALUE match = rb_backref_get();
4342 struct re_registers *regs = RMATCH_REGS(match);
4343 pos = rb_str_sublen(str, BEG(0));
4344 return LONG2NUM(pos);
4345 }
4346 }
4347 else {
4348 StringValue(sub);
4349 pos = rb_str_rindex(str, sub, pos);
4350 if (pos >= 0) {
4351 pos = rb_str_sublen(str, pos);
4352 return LONG2NUM(pos);
4353 }
4354 }
4355 return Qnil;
4356}
4357
4358static long
4359rb_str_byterindex(VALUE str, VALUE sub, long pos)
4360{
4361 long len, slen;
4362 char *sbeg, *s;
4363 rb_encoding *enc;
4364
4365 enc = rb_enc_check(str, sub);
4366 if (is_broken_string(sub)) return -1;
4367 len = RSTRING_LEN(str);
4368 slen = RSTRING_LEN(sub);
4369
4370 /* substring longer than string */
4371 if (len < slen) return -1;
4372 if (len - pos < slen) pos = len - slen;
4373 if (len == 0) return pos;
4374
4375 sbeg = RSTRING_PTR(str);
4376
4377 if (pos == 0) {
4378 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4379 return 0;
4380 else
4381 return -1;
4382 }
4383
4384 s = sbeg + pos;
4385 return str_rindex(str, sub, s, enc);
4386}
4387
4388
4389/*
4390 * call-seq:
4391 * byterindex(substring, offset = self.bytesize) -> integer or nil
4392 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4393 *
4394 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4395 * or +nil+ if none found:
4396 *
4397 * 'foo'.byterindex('f') # => 0
4398 * 'foo'.byterindex('o') # => 2
4399 * 'foo'.byterindex('oo') # => 1
4400 * 'foo'.byterindex('ooo') # => nil
4401 *
4402 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4403 * or +nil+ if none found:
4404 *
4405 * 'foo'.byterindex(/f/) # => 0
4406 * 'foo'.byterindex(/o/) # => 2
4407 * 'foo'.byterindex(/oo/) # => 1
4408 * 'foo'.byterindex(/ooo/) # => nil
4409 *
4410 * The _last_ match means starting at the possible last position, not
4411 * the last of longest matches.
4412 *
4413 * 'foo'.byterindex(/o+/) # => 2
4414 * $~ #=> #<MatchData "o">
4415 *
4416 * To get the last longest match, combine the pattern with a negative
4417 * lookbehind.
4418 *
4419 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4420 * $~ #=> #<MatchData "oo">
4421 *
4422 * Or use String#byteindex with a negative lookahead.
4423 *
4424 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4425 * $~ #=> #<MatchData "oo">
4426 *
4427 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4428 * string to _end_ the search:
4429 *
4430 * 'foo'.byterindex('o', 0) # => nil
4431 * 'foo'.byterindex('o', 1) # => 1
4432 * 'foo'.byterindex('o', 2) # => 2
4433 * 'foo'.byterindex('o', 3) # => 2
4434 *
4435 * If +offset+ is a negative Integer, the maximum starting position in the
4436 * string to _end_ the search is the sum of the string's length and +offset+:
4437 *
4438 * 'foo'.byterindex('o', -1) # => 2
4439 * 'foo'.byterindex('o', -2) # => 1
4440 * 'foo'.byterindex('o', -3) # => nil
4441 * 'foo'.byterindex('o', -4) # => nil
4442 *
4443 * If +offset+ does not land on a character (codepoint) boundary, +IndexError+ is
4444 * raised.
4445 *
4446 * Related: String#byteindex.
4447 */
4448
4449static VALUE
4450rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4451{
4452 VALUE sub;
4453 VALUE initpos;
4454 long pos, len = RSTRING_LEN(str);
4455
4456 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4457 pos = NUM2LONG(initpos);
4458 if (pos < 0 && (pos += len) < 0) {
4459 if (RB_TYPE_P(sub, T_REGEXP)) {
4461 }
4462 return Qnil;
4463 }
4464 if (pos > len) pos = len;
4465 }
4466 else {
4467 pos = len;
4468 }
4469
4470 str_ensure_byte_pos(str, pos);
4471
4472 if (RB_TYPE_P(sub, T_REGEXP)) {
4473 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4474 VALUE match = rb_backref_get();
4475 struct re_registers *regs = RMATCH_REGS(match);
4476 pos = BEG(0);
4477 return LONG2NUM(pos);
4478 }
4479 }
4480 else {
4481 StringValue(sub);
4482 pos = rb_str_byterindex(str, sub, pos);
4483 if (pos >= 0) return LONG2NUM(pos);
4484 }
4485 return Qnil;
4486}
4487
4488/*
4489 * call-seq:
4490 * string =~ regexp -> integer or nil
4491 * string =~ object -> integer or nil
4492 *
4493 * Returns the Integer index of the first substring that matches
4494 * the given +regexp+, or +nil+ if no match found:
4495 *
4496 * 'foo' =~ /f/ # => 0
4497 * 'foo' =~ /o/ # => 1
4498 * 'foo' =~ /x/ # => nil
4499 *
4500 * Note: also updates Regexp@Global+Variables.
4501 *
4502 * If the given +object+ is not a Regexp, returns the value
4503 * returned by <tt>object =~ self</tt>.
4504 *
4505 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4506 * (see Regexp#=~):
4507 *
4508 * number= nil
4509 * "no. 9" =~ /(?<number>\d+)/
4510 * number # => nil (not assigned)
4511 * /(?<number>\d+)/ =~ "no. 9"
4512 * number #=> "9"
4513 *
4514 */
4515
4516static VALUE
4517rb_str_match(VALUE x, VALUE y)
4518{
4519 switch (OBJ_BUILTIN_TYPE(y)) {
4520 case T_STRING:
4521 rb_raise(rb_eTypeError, "type mismatch: String given");
4522
4523 case T_REGEXP:
4524 return rb_reg_match(y, x);
4525
4526 default:
4527 return rb_funcall(y, idEqTilde, 1, x);
4528 }
4529}
4530
4531
4532static VALUE get_pat(VALUE);
4533
4534
4535/*
4536 * call-seq:
4537 * match(pattern, offset = 0) -> matchdata or nil
4538 * match(pattern, offset = 0) {|matchdata| ... } -> object
4539 *
4540 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4541 *
4542 * Note: also updates Regexp@Global+Variables.
4543 *
4544 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4545 * regexp = Regexp.new(pattern)
4546 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4547 * (see Regexp#match):
4548 * matchdata = regexp.match(self)
4549 *
4550 * With no block given, returns the computed +matchdata+:
4551 *
4552 * 'foo'.match('f') # => #<MatchData "f">
4553 * 'foo'.match('o') # => #<MatchData "o">
4554 * 'foo'.match('x') # => nil
4555 *
4556 * If Integer argument +offset+ is given, the search begins at index +offset+:
4557 *
4558 * 'foo'.match('f', 1) # => nil
4559 * 'foo'.match('o', 1) # => #<MatchData "o">
4560 *
4561 * With a block given, calls the block with the computed +matchdata+
4562 * and returns the block's return value:
4563 *
4564 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4565 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4566 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4567 *
4568 */
4569
4570static VALUE
4571rb_str_match_m(int argc, VALUE *argv, VALUE str)
4572{
4573 VALUE re, result;
4574 if (argc < 1)
4575 rb_check_arity(argc, 1, 2);
4576 re = argv[0];
4577 argv[0] = str;
4578 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4579 if (!NIL_P(result) && rb_block_given_p()) {
4580 return rb_yield(result);
4581 }
4582 return result;
4583}
4584
4585/*
4586 * call-seq:
4587 * match?(pattern, offset = 0) -> true or false
4588 *
4589 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4590 *
4591 * Note: does not update Regexp@Global+Variables.
4592 *
4593 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4594 * regexp = Regexp.new(pattern)
4595 *
4596 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4597 * +false+ otherwise:
4598 *
4599 * 'foo'.match?(/o/) # => true
4600 * 'foo'.match?('o') # => true
4601 * 'foo'.match?(/x/) # => false
4602 *
4603 * If Integer argument +offset+ is given, the search begins at index +offset+:
4604 * 'foo'.match?('f', 1) # => false
4605 * 'foo'.match?('o', 1) # => true
4606 *
4607 */
4608
4609static VALUE
4610rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4611{
4612 VALUE re;
4613 rb_check_arity(argc, 1, 2);
4614 re = get_pat(argv[0]);
4615 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4616}
4617
/* Result of stepping one character to its neighbor
 * (see enc_succ_char, enc_pred_char and enc_succ_alnum_char). */
enum neighbor_char {
    NEIGHBOR_NOT_CHAR,  /* no usable neighboring character */
    NEIGHBOR_FOUND,     /* the character was replaced by its neighbor */
    NEIGHBOR_WRAPPED    /* stepping ran past the end of the range */
};
4623
4624static enum neighbor_char
4625enc_succ_char(char *p, long len, rb_encoding *enc)
4626{
4627 long i;
4628 int l;
4629
4630 if (rb_enc_mbminlen(enc) > 1) {
4631 /* wchar, trivial case */
4632 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4633 if (!MBCLEN_CHARFOUND_P(r)) {
4634 return NEIGHBOR_NOT_CHAR;
4635 }
4636 c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
4637 l = rb_enc_code_to_mbclen(c, enc);
4638 if (!l) return NEIGHBOR_NOT_CHAR;
4639 if (l != len) return NEIGHBOR_WRAPPED;
4640 rb_enc_mbcput(c, p, enc);
4641 r = rb_enc_precise_mbclen(p, p + len, enc);
4642 if (!MBCLEN_CHARFOUND_P(r)) {
4643 return NEIGHBOR_NOT_CHAR;
4644 }
4645 return NEIGHBOR_FOUND;
4646 }
4647 while (1) {
4648 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
4649 p[i] = '\0';
4650 if (i < 0)
4651 return NEIGHBOR_WRAPPED;
4652 ++((unsigned char*)p)[i];
4653 l = rb_enc_precise_mbclen(p, p+len, enc);
4654 if (MBCLEN_CHARFOUND_P(l)) {
4655 l = MBCLEN_CHARFOUND_LEN(l);
4656 if (l == len) {
4657 return NEIGHBOR_FOUND;
4658 }
4659 else {
4660 memset(p+l, 0xff, len-l);
4661 }
4662 }
4663 if (MBCLEN_INVALID_P(l) && i < len-1) {
4664 long len2;
4665 int l2;
4666 for (len2 = len-1; 0 < len2; len2--) {
4667 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4668 if (!MBCLEN_INVALID_P(l2))
4669 break;
4670 }
4671 memset(p+len2+1, 0xff, len-(len2+1));
4672 }
4673 }
4674}
4675
4676static enum neighbor_char
4677enc_pred_char(char *p, long len, rb_encoding *enc)
4678{
4679 long i;
4680 int l;
4681 if (rb_enc_mbminlen(enc) > 1) {
4682 /* wchar, trivial case */
4683 int r = rb_enc_precise_mbclen(p, p + len, enc), c;
4684 if (!MBCLEN_CHARFOUND_P(r)) {
4685 return NEIGHBOR_NOT_CHAR;
4686 }
4687 c = rb_enc_mbc_to_codepoint(p, p + len, enc);
4688 if (!c) return NEIGHBOR_NOT_CHAR;
4689 --c;
4690 l = rb_enc_code_to_mbclen(c, enc);
4691 if (!l) return NEIGHBOR_NOT_CHAR;
4692 if (l != len) return NEIGHBOR_WRAPPED;
4693 rb_enc_mbcput(c, p, enc);
4694 r = rb_enc_precise_mbclen(p, p + len, enc);
4695 if (!MBCLEN_CHARFOUND_P(r)) {
4696 return NEIGHBOR_NOT_CHAR;
4697 }
4698 return NEIGHBOR_FOUND;
4699 }
4700 while (1) {
4701 for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
4702 p[i] = '\xff';
4703 if (i < 0)
4704 return NEIGHBOR_WRAPPED;
4705 --((unsigned char*)p)[i];
4706 l = rb_enc_precise_mbclen(p, p+len, enc);
4707 if (MBCLEN_CHARFOUND_P(l)) {
4708 l = MBCLEN_CHARFOUND_LEN(l);
4709 if (l == len) {
4710 return NEIGHBOR_FOUND;
4711 }
4712 else {
4713 memset(p+l, 0, len-l);
4714 }
4715 }
4716 if (MBCLEN_INVALID_P(l) && i < len-1) {
4717 long len2;
4718 int l2;
4719 for (len2 = len-1; 0 < len2; len2--) {
4720 l2 = rb_enc_precise_mbclen(p, p+len2, enc);
4721 if (!MBCLEN_INVALID_P(l2))
4722 break;
4723 }
4724 memset(p+len2+1, 0, len-(len2+1));
4725 }
4726 }
4727}
4728
4729/*
4730 overwrite +p+ by succeeding letter in +enc+ and returns
4731 NEIGHBOR_FOUND or NEIGHBOR_WRAPPED.
4732 When NEIGHBOR_WRAPPED, carried-out letter is stored into carry.
4733 assuming each ranges are successive, and mbclen
4734 never change in each ranges.
4735 NEIGHBOR_NOT_CHAR is returned if invalid character or the range has only one
4736 character.
4737 */
4738static enum neighbor_char
4739enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
4740{
4741 enum neighbor_char ret;
4742 unsigned int c;
4743 int ctype;
4744 int range;
4745 char save[ONIGENC_CODE_TO_MBC_MAXLEN];
4746
4747 /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
4748 int try;
4749 const int max_gaps = 1;
4750
4751 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4752 if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
4753 ctype = ONIGENC_CTYPE_DIGIT;
4754 else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
4755 ctype = ONIGENC_CTYPE_ALPHA;
4756 else
4757 return NEIGHBOR_NOT_CHAR;
4758
4759 MEMCPY(save, p, char, len);
4760 for (try = 0; try <= max_gaps; ++try) {
4761 ret = enc_succ_char(p, len, enc);
4762 if (ret == NEIGHBOR_FOUND) {
4763 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4764 if (rb_enc_isctype(c, ctype, enc))
4765 return NEIGHBOR_FOUND;
4766 }
4767 }
4768 MEMCPY(p, save, char, len);
4769 range = 1;
4770 while (1) {
4771 MEMCPY(save, p, char, len);
4772 ret = enc_pred_char(p, len, enc);
4773 if (ret == NEIGHBOR_FOUND) {
4774 c = rb_enc_mbc_to_codepoint(p, p+len, enc);
4775 if (!rb_enc_isctype(c, ctype, enc)) {
4776 MEMCPY(p, save, char, len);
4777 break;
4778 }
4779 }
4780 else {
4781 MEMCPY(p, save, char, len);
4782 break;
4783 }
4784 range++;
4785 }
4786 if (range == 1) {
4787 return NEIGHBOR_NOT_CHAR;
4788 }
4789
4790 if (ctype != ONIGENC_CTYPE_DIGIT) {
4791 MEMCPY(carry, p, char, len);
4792 return NEIGHBOR_WRAPPED;
4793 }
4794
4795 MEMCPY(carry, p, char, len);
4796 enc_succ_char(carry, len, enc);
4797 return NEIGHBOR_WRAPPED;
4798}
4799
4800
4801static VALUE str_succ(VALUE str);
4802
4803/*
4804 * call-seq:
4805 * succ -> new_str
4806 *
4807 * Returns the successor to +self+. The successor is calculated by
4808 * incrementing characters.
4809 *
4810 * The first character to be incremented is the rightmost alphanumeric,
4811 * or, if no alphanumerics, the rightmost character:
4812 *
4813 * 'THX1138'.succ # => "THX1139"
4814 * '<<koala>>'.succ # => "<<koalb>>"
4815 * '***'.succ # => '**+'
4816 *
4817 * The successor to a digit is another digit, "carrying" to the next-left
4818 * character for a "rollover" from 9 to 0, and prepending another digit
4819 * if necessary:
4820 *
4821 * '00'.succ # => "01"
4822 * '09'.succ # => "10"
4823 * '99'.succ # => "100"
4824 *
4825 * The successor to a letter is another letter of the same case,
4826 * carrying to the next-left character for a rollover,
4827 * and prepending another same-case letter if necessary:
4828 *
4829 * 'aa'.succ # => "ab"
4830 * 'az'.succ # => "ba"
4831 * 'zz'.succ # => "aaa"
4832 * 'AA'.succ # => "AB"
4833 * 'AZ'.succ # => "BA"
4834 * 'ZZ'.succ # => "AAA"
4835 *
4836 * The successor to a non-alphanumeric character is the next character
4837 * in the underlying character set's collating sequence,
4838 * carrying to the next-left character for a rollover,
4839 * and prepending another character if necessary:
4840 *
4841 * s = 0.chr * 3
4842 * s # => "\x00\x00\x00"
4843 * s.succ # => "\x00\x00\x01"
4844 * s = 255.chr * 3
4845 * s # => "\xFF\xFF\xFF"
4846 * s.succ # => "\x01\x00\x00\x00"
4847 *
4848 * Carrying can occur between and among mixtures of alphanumeric characters:
4849 *
4850 * s = 'zz99zz99'
4851 * s.succ # => "aaa00aa00"
4852 * s = '99zz99zz'
4853 * s.succ # => "100aa00aa"
4854 *
4855 * The successor to an empty \String is a new empty \String:
4856 *
4857 * ''.succ # => ""
4858 *
4859 */
4860
4861VALUE
4863{
4864 VALUE str;
4865 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4866 rb_enc_cr_str_copy_for_substr(str, orig);
4867 return str_succ(str);
4868}
4869
4870static VALUE
4871str_succ(VALUE str)
4872{
4873 rb_encoding *enc;
4874 char *sbeg, *s, *e, *last_alnum = 0;
4875 int found_alnum = 0;
4876 long l, slen;
4877 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4878 long carry_pos = 0, carry_len = 1;
4879 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4880
4881 slen = RSTRING_LEN(str);
4882 if (slen == 0) return str;
4883
4884 enc = STR_ENC_GET(str);
4885 sbeg = RSTRING_PTR(str);
4886 s = e = sbeg + slen;
4887
4888 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4889 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4890 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4891 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4892 break;
4893 }
4894 }
4895 l = rb_enc_precise_mbclen(s, e, enc);
4896 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4897 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4898 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4899 switch (neighbor) {
4900 case NEIGHBOR_NOT_CHAR:
4901 continue;
4902 case NEIGHBOR_FOUND:
4903 return str;
4904 case NEIGHBOR_WRAPPED:
4905 last_alnum = s;
4906 break;
4907 }
4908 found_alnum = 1;
4909 carry_pos = s - sbeg;
4910 carry_len = l;
4911 }
4912 if (!found_alnum) { /* str contains no alnum */
4913 s = e;
4914 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4915 enum neighbor_char neighbor;
4916 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4917 l = rb_enc_precise_mbclen(s, e, enc);
4918 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4919 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4920 MEMCPY(tmp, s, char, l);
4921 neighbor = enc_succ_char(tmp, l, enc);
4922 switch (neighbor) {
4923 case NEIGHBOR_FOUND:
4924 MEMCPY(s, tmp, char, l);
4925 return str;
4926 break;
4927 case NEIGHBOR_WRAPPED:
4928 MEMCPY(s, tmp, char, l);
4929 break;
4930 case NEIGHBOR_NOT_CHAR:
4931 break;
4932 }
4933 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4934 /* wrapped to \0...\0. search next valid char. */
4935 enc_succ_char(s, l, enc);
4936 }
4937 if (!rb_enc_asciicompat(enc)) {
4938 MEMCPY(carry, s, char, l);
4939 carry_len = l;
4940 }
4941 carry_pos = s - sbeg;
4942 }
4944 }
4945 RESIZE_CAPA(str, slen + carry_len);
4946 sbeg = RSTRING_PTR(str);
4947 s = sbeg + carry_pos;
4948 memmove(s + carry_len, s, slen - carry_pos);
4949 memmove(s, carry, carry_len);
4950 slen += carry_len;
4951 STR_SET_LEN(str, slen);
4952 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4954 return str;
4955}
4956
4957
4958/*
4959 * call-seq:
4960 * succ! -> self
4961 *
4962 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4963 */
4964
4965static VALUE
4966rb_str_succ_bang(VALUE str)
4967{
4968 rb_str_modify(str);
4969 str_succ(str);
4970 return str;
4971}
4972
/*
 * Returns 1 when each of the +len+ bytes starting at +s+ satisfies
 * ISDIGIT, 0 otherwise.  A non-positive +len+ yields 1.
 */
static int
all_digits_p(const char *s, long len)
{
    for (long i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) return 0;
    }
    return 1;
}
4982
4983static int
4984str_upto_i(VALUE str, VALUE arg)
4985{
4986 rb_yield(str);
4987 return 0;
4988}
4989
4990/*
4991 * call-seq:
4992 * upto(other_string, exclusive = false) {|string| ... } -> self
4993 * upto(other_string, exclusive = false) -> new_enumerator
4994 *
4995 * With a block given, calls the block with each \String value
4996 * returned by successive calls to String#succ;
4997 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4998 * the sequence terminates when value +other_string+ is reached;
4999 * returns +self+:
5000 *
5001 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
5002 * Output:
5003 *
5004 * a8 a9 b0 b1 b2 b3 b4 b5 b6
5005 *
5006 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
5007 *
5008 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
5009 *
5010 * Output:
5011 *
5012 * a8 a9 b0 b1 b2 b3 b4 b5
5013 *
5014 * If +other_string+ would not be reached, does not call the block:
5015 *
5016 * '25'.upto('5') {|s| fail s }
5017 * 'aa'.upto('a') {|s| fail s }
5018 *
5019 * With no block given, returns a new Enumerator:
5020 *
5021 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
5022 *
5023 */
5024
5025static VALUE
5026rb_str_upto(int argc, VALUE *argv, VALUE beg)
5027{
5028 VALUE end, exclusive;
5029
5030 rb_scan_args(argc, argv, "11", &end, &exclusive);
5031 RETURN_ENUMERATOR(beg, argc, argv);
5032 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
5033}
5034
5035VALUE
5036rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5037{
5038 VALUE current, after_end;
5039 ID succ;
5040 int n, ascii;
5041 rb_encoding *enc;
5042
5043 CONST_ID(succ, "succ");
5044 StringValue(end);
5045 enc = rb_enc_check(beg, end);
5046 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5047 /* single character */
5048 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5049 char c = RSTRING_PTR(beg)[0];
5050 char e = RSTRING_PTR(end)[0];
5051
5052 if (c > e || (excl && c == e)) return beg;
5053 for (;;) {
5054 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
5055 if (!excl && c == e) break;
5056 c++;
5057 if (excl && c == e) break;
5058 }
5059 return beg;
5060 }
5061 /* both edges are all digits */
5062 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5063 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5064 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5065 VALUE b, e;
5066 int width;
5067
5068 width = RSTRING_LENINT(beg);
5069 b = rb_str_to_inum(beg, 10, FALSE);
5070 e = rb_str_to_inum(end, 10, FALSE);
5071 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5072 long bi = FIX2LONG(b);
5073 long ei = FIX2LONG(e);
5074 rb_encoding *usascii = rb_usascii_encoding();
5075
5076 while (bi <= ei) {
5077 if (excl && bi == ei) break;
5078 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5079 bi++;
5080 }
5081 }
5082 else {
5083 ID op = excl ? '<' : idLE;
5084 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5085
5086 args[0] = INT2FIX(width);
5087 while (rb_funcall(b, op, 1, e)) {
5088 args[1] = b;
5089 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5090 b = rb_funcallv(b, succ, 0, 0);
5091 }
5092 }
5093 return beg;
5094 }
5095 /* normal case */
5096 n = rb_str_cmp(beg, end);
5097 if (n > 0 || (excl && n == 0)) return beg;
5098
5099 after_end = rb_funcallv(end, succ, 0, 0);
5100 current = str_duplicate(rb_cString, beg);
5101 while (!rb_str_equal(current, after_end)) {
5102 VALUE next = Qnil;
5103 if (excl || !rb_str_equal(current, end))
5104 next = rb_funcallv(current, succ, 0, 0);
5105 if ((*each)(current, arg)) break;
5106 if (NIL_P(next)) break;
5107 current = next;
5108 StringValue(current);
5109 if (excl && rb_str_equal(current, end)) break;
5110 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5111 break;
5112 }
5113
5114 return beg;
5115}
5116
5117VALUE
5118rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5119{
5120 VALUE current;
5121 ID succ;
5122
5123 CONST_ID(succ, "succ");
5124 /* both edges are all digits */
5125 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5126 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5127 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5128 int width = RSTRING_LENINT(beg);
5129 b = rb_str_to_inum(beg, 10, FALSE);
5130 if (FIXNUM_P(b)) {
5131 long bi = FIX2LONG(b);
5132 rb_encoding *usascii = rb_usascii_encoding();
5133
5134 while (FIXABLE(bi)) {
5135 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5136 bi++;
5137 }
5138 b = LONG2NUM(bi);
5139 }
5140 args[0] = INT2FIX(width);
5141 while (1) {
5142 args[1] = b;
5143 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5144 b = rb_funcallv(b, succ, 0, 0);
5145 }
5146 }
5147 /* normal case */
5148 current = str_duplicate(rb_cString, beg);
5149 while (1) {
5150 VALUE next = rb_funcallv(current, succ, 0, 0);
5151 if ((*each)(current, arg)) break;
5152 current = next;
5153 StringValue(current);
5154 if (RSTRING_LEN(current) == 0)
5155 break;
5156 }
5157
5158 return beg;
5159}
5160
5161static int
5162include_range_i(VALUE str, VALUE arg)
5163{
5164 VALUE *argp = (VALUE *)arg;
5165 if (!rb_equal(str, *argp)) return 0;
5166 *argp = Qnil;
5167 return 1;
5168}
5169
5170VALUE
5171rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5172{
5173 beg = rb_str_new_frozen(beg);
5174 StringValue(end);
5175 end = rb_str_new_frozen(end);
5176 if (NIL_P(val)) return Qfalse;
5177 val = rb_check_string_type(val);
5178 if (NIL_P(val)) return Qfalse;
5179 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5180 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5181 rb_enc_asciicompat(STR_ENC_GET(val))) {
5182 const char *bp = RSTRING_PTR(beg);
5183 const char *ep = RSTRING_PTR(end);
5184 const char *vp = RSTRING_PTR(val);
5185 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5186 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5187 return Qfalse;
5188 else {
5189 char b = *bp;
5190 char e = *ep;
5191 char v = *vp;
5192
5193 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5194 if (b <= v && v < e) return Qtrue;
5195 return RBOOL(!RTEST(exclusive) && v == e);
5196 }
5197 }
5198 }
5199#if 0
5200 /* both edges are all digits */
5201 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5202 all_digits_p(bp, RSTRING_LEN(beg)) &&
5203 all_digits_p(ep, RSTRING_LEN(end))) {
5204 /* TODO */
5205 }
5206#endif
5207 }
5208 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5209
5210 return RBOOL(NIL_P(val));
5211}
5212
5213static VALUE
5214rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5215{
5216 if (rb_reg_search(re, str, 0, 0) >= 0) {
5217 VALUE match = rb_backref_get();
5218 int nth = rb_reg_backref_number(match, backref);
5219 return rb_reg_nth_match(nth, match);
5220 }
5221 return Qnil;
5222}
5223
5224static VALUE
5225rb_str_aref(VALUE str, VALUE indx)
5226{
5227 long idx;
5228
5229 if (FIXNUM_P(indx)) {
5230 idx = FIX2LONG(indx);
5231 }
5232 else if (RB_TYPE_P(indx, T_REGEXP)) {
5233 return rb_str_subpat(str, indx, INT2FIX(0));
5234 }
5235 else if (RB_TYPE_P(indx, T_STRING)) {
5236 if (rb_str_index(str, indx, 0) != -1)
5237 return str_duplicate(rb_cString, indx);
5238 return Qnil;
5239 }
5240 else {
5241 /* check if indx is Range */
5242 long beg, len = str_strlen(str, NULL);
5243 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5244 case Qfalse:
5245 break;
5246 case Qnil:
5247 return Qnil;
5248 default:
5249 return rb_str_substr(str, beg, len);
5250 }
5251 idx = NUM2LONG(indx);
5252 }
5253
5254 return str_substr(str, idx, 1, FALSE);
5255}
5256
5257
5258/*
5259 * call-seq:
5260 * string[index] -> new_string or nil
5261 * string[start, length] -> new_string or nil
5262 * string[range] -> new_string or nil
5263 * string[regexp, capture = 0] -> new_string or nil
5264 * string[substring] -> new_string or nil
5265 *
5266 * Returns the substring of +self+ specified by the arguments.
5267 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5268 *
5269 *
5270 */
5271
5272static VALUE
5273rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5274{
5275 if (argc == 2) {
5276 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5277 return rb_str_subpat(str, argv[0], argv[1]);
5278 }
5279 else {
5280 long beg = NUM2LONG(argv[0]);
5281 long len = NUM2LONG(argv[1]);
5282 return rb_str_substr(str, beg, len);
5283 }
5284 }
5285 rb_check_arity(argc, 1, 2);
5286 return rb_str_aref(str, argv[0]);
5287}
5288
5289VALUE
5291{
5292 char *ptr = RSTRING_PTR(str);
5293 long olen = RSTRING_LEN(str), nlen;
5294
5295 str_modifiable(str);
5296 if (len > olen) len = olen;
5297 nlen = olen - len;
5298 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5299 char *oldptr = ptr;
5300 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5301 STR_SET_EMBED(str);
5302 ptr = RSTRING(str)->as.embed.ary;
5303 memmove(ptr, oldptr + len, nlen);
5304 if (fl == STR_NOEMBED) xfree(oldptr);
5305 }
5306 else {
5307 if (!STR_SHARED_P(str)) {
5308 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5309 rb_enc_cr_str_exact_copy(shared, str);
5310 OBJ_FREEZE(shared);
5311 }
5312 ptr = RSTRING(str)->as.heap.ptr += len;
5313 }
5314 STR_SET_LEN(str, nlen);
5315
5316 if (!SHARABLE_MIDDLE_SUBSTRING) {
5317 TERM_FILL(ptr + nlen, TERM_LEN(str));
5318 }
5320 return str;
5321}
5322
5323static void
5324rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5325{
5326 char *sptr;
5327 long slen;
5328 int cr;
5329
5330 if (beg == 0 && vlen == 0) {
5331 rb_str_drop_bytes(str, len);
5332 return;
5333 }
5334
5335 str_modify_keep_cr(str);
5336 RSTRING_GETMEM(str, sptr, slen);
5337 if (len < vlen) {
5338 /* expand string */
5339 RESIZE_CAPA(str, slen + vlen - len);
5340 sptr = RSTRING_PTR(str);
5341 }
5342
5344 cr = rb_enc_str_coderange(val);
5345 else
5347
5348 if (vlen != len) {
5349 memmove(sptr + beg + vlen,
5350 sptr + beg + len,
5351 slen - (beg + len));
5352 }
5353 if (vlen < beg && len < 0) {
5354 MEMZERO(sptr + slen, char, -len);
5355 }
5356 if (vlen > 0) {
5357 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5358 }
5359 slen += vlen - len;
5360 STR_SET_LEN(str, slen);
5361 TERM_FILL(&sptr[slen], TERM_LEN(str));
5362 ENC_CODERANGE_SET(str, cr);
5363}
5364
5365static inline void
5366rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5367{
5368 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5369}
5370
5371void
5372rb_str_update(VALUE str, long beg, long len, VALUE val)
5373{
5374 long slen;
5375 char *p, *e;
5376 rb_encoding *enc;
5377 int singlebyte = single_byte_optimizable(str);
5378 int cr;
5379
5380 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5381
5382 StringValue(val);
5383 enc = rb_enc_check(str, val);
5384 slen = str_strlen(str, enc); /* rb_enc_check */
5385
5386 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5387 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5388 }
5389 if (beg < 0) {
5390 beg += slen;
5391 }
5392 assert(beg >= 0);
5393 assert(beg <= slen);
5394 if (len > slen - beg) {
5395 len = slen - beg;
5396 }
5397 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5398 if (!p) p = RSTRING_END(str);
5399 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5400 if (!e) e = RSTRING_END(str);
5401 /* error check */
5402 beg = p - RSTRING_PTR(str); /* physical position */
5403 len = e - p; /* physical length */
5404 rb_str_update_0(str, beg, len, val);
5405 rb_enc_associate(str, enc);
5407 if (cr != ENC_CODERANGE_BROKEN)
5408 ENC_CODERANGE_SET(str, cr);
5409}
5410
5411static void
5412rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5413{
5414 int nth;
5415 VALUE match;
5416 long start, end, len;
5417 rb_encoding *enc;
5418 struct re_registers *regs;
5419
5420 if (rb_reg_search(re, str, 0, 0) < 0) {
5421 rb_raise(rb_eIndexError, "regexp not matched");
5422 }
5423 match = rb_backref_get();
5424 nth = rb_reg_backref_number(match, backref);
5425 regs = RMATCH_REGS(match);
5426 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5427 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5428 }
5429 if (nth < 0) {
5430 nth += regs->num_regs;
5431 }
5432
5433 start = BEG(nth);
5434 if (start == -1) {
5435 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5436 }
5437 end = END(nth);
5438 len = end - start;
5439 StringValue(val);
5440 enc = rb_enc_check_str(str, val);
5441 rb_str_update_0(str, start, len, val);
5442 rb_enc_associate(str, enc);
5443}
5444
5445static VALUE
5446rb_str_aset(VALUE str, VALUE indx, VALUE val)
5447{
5448 long idx, beg;
5449
5450 switch (TYPE(indx)) {
5451 case T_REGEXP:
5452 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5453 return val;
5454
5455 case T_STRING:
5456 beg = rb_str_index(str, indx, 0);
5457 if (beg < 0) {
5458 rb_raise(rb_eIndexError, "string not matched");
5459 }
5460 beg = rb_str_sublen(str, beg);
5461 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5462 return val;
5463
5464 default:
5465 /* check if indx is Range */
5466 {
5467 long beg, len;
5468 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5469 rb_str_update(str, beg, len, val);
5470 return val;
5471 }
5472 }
5473 /* FALLTHROUGH */
5474
5475 case T_FIXNUM:
5476 idx = NUM2LONG(indx);
5477 rb_str_update(str, idx, 1, val);
5478 return val;
5479 }
5480}
5481
5482/*
5483 * call-seq:
5484 * string[index] = new_string
5485 * string[start, length] = new_string
5486 * string[range] = new_string
5487 * string[regexp, capture = 0] = new_string
5488 * string[substring] = new_string
5489 *
5490 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5491 * See {String Slices}[rdoc-ref:String@String+Slices].
5492 *
5493 * A few examples:
5494 *
5495 * s = 'foo'
5496 * s[2] = 'rtune' # => "rtune"
5497 * s # => "fortune"
5498 * s[1, 5] = 'init' # => "init"
5499 * s # => "finite"
5500 * s[3..4] = 'al' # => "al"
5501 * s # => "finale"
5502 * s[/e$/] = 'ly' # => "ly"
5503 * s # => "finally"
5504 * s['lly'] = 'ncial' # => "ncial"
5505 * s # => "financial"
5506 *
5507 */
5508
5509static VALUE
5510rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5511{
5512 if (argc == 3) {
5513 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5514 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5515 }
5516 else {
5517 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5518 }
5519 return argv[2];
5520 }
5521 rb_check_arity(argc, 2, 3);
5522 return rb_str_aset(str, argv[0], argv[1]);
5523}
5524
5525/*
5526 * call-seq:
5527 * insert(index, other_string) -> self
5528 *
5529 * Inserts the given +other_string+ into +self+; returns +self+.
5530 *
5531 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5532 *
5533 * 'foo'.insert(1, 'bar') # => "fbaroo"
5534 *
5535 * If the Integer +index+ is negative, counts backward from the end of +self+
5536 * and inserts +other_string+ at offset <tt>index+1</tt>
5537 * (that is, _after_ <tt>self[index]</tt>):
5538 *
5539 * 'foo'.insert(-2, 'bar') # => "fobaro"
5540 *
5541 */
5542
5543static VALUE
5544rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5545{
5546 long pos = NUM2LONG(idx);
5547
5548 if (pos == -1) {
5549 return rb_str_append(str, str2);
5550 }
5551 else if (pos < 0) {
5552 pos++;
5553 }
5554 rb_str_update(str, pos, 0, str2);
5555 return str;
5556}
5557
5558
5559/*
5560 * call-seq:
5561 * slice!(index) -> new_string or nil
5562 * slice!(start, length) -> new_string or nil
5563 * slice!(range) -> new_string or nil
5564 * slice!(regexp, capture = 0) -> new_string or nil
5565 * slice!(substring) -> new_string or nil
5566 *
5567 * Removes and returns the substring of +self+ specified by the arguments.
5568 * See {String Slices}[rdoc-ref:String@String+Slices].
5569 *
5570 * A few examples:
5571 *
5572 * string = "This is a string"
5573 * string.slice!(2) #=> "i"
5574 * string.slice!(3..6) #=> " is "
5575 * string.slice!(/s.*t/) #=> "sa st"
5576 * string.slice!("r") #=> "r"
5577 * string #=> "Thing"
5578 *
5579 */
5580
5581static VALUE
5582rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5583{
5584 VALUE result = Qnil;
5585 VALUE indx;
5586 long beg, len = 1;
5587 char *p;
5588
5589 rb_check_arity(argc, 1, 2);
5590 str_modify_keep_cr(str);
5591 indx = argv[0];
5592 if (RB_TYPE_P(indx, T_REGEXP)) {
5593 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5594 VALUE match = rb_backref_get();
5595 struct re_registers *regs = RMATCH_REGS(match);
5596 int nth = 0;
5597 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5598 if ((nth += regs->num_regs) <= 0) return Qnil;
5599 }
5600 else if (nth >= regs->num_regs) return Qnil;
5601 beg = BEG(nth);
5602 len = END(nth) - beg;
5603 goto subseq;
5604 }
5605 else if (argc == 2) {
5606 beg = NUM2LONG(indx);
5607 len = NUM2LONG(argv[1]);
5608 goto num_index;
5609 }
5610 else if (FIXNUM_P(indx)) {
5611 beg = FIX2LONG(indx);
5612 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5613 if (!len) return Qnil;
5614 beg = p - RSTRING_PTR(str);
5615 goto subseq;
5616 }
5617 else if (RB_TYPE_P(indx, T_STRING)) {
5618 beg = rb_str_index(str, indx, 0);
5619 if (beg == -1) return Qnil;
5620 len = RSTRING_LEN(indx);
5621 result = str_duplicate(rb_cString, indx);
5622 goto squash;
5623 }
5624 else {
5625 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5626 case Qnil:
5627 return Qnil;
5628 case Qfalse:
5629 beg = NUM2LONG(indx);
5630 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5631 if (!len) return Qnil;
5632 beg = p - RSTRING_PTR(str);
5633 goto subseq;
5634 default:
5635 goto num_index;
5636 }
5637 }
5638
5639 num_index:
5640 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5641 beg = p - RSTRING_PTR(str);
5642
5643 subseq:
5644 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5645 rb_enc_cr_str_copy_for_substr(result, str);
5646
5647 squash:
5648 if (len > 0) {
5649 if (beg == 0) {
5650 rb_str_drop_bytes(str, len);
5651 }
5652 else {
5653 char *sptr = RSTRING_PTR(str);
5654 long slen = RSTRING_LEN(str);
5655 if (beg + len > slen) /* pathological check */
5656 len = slen - beg;
5657 memmove(sptr + beg,
5658 sptr + beg + len,
5659 slen - (beg + len));
5660 slen -= len;
5661 STR_SET_LEN(str, slen);
5662 TERM_FILL(&sptr[slen], TERM_LEN(str));
5663 }
5664 }
5665 return result;
5666}
5667
5668static VALUE
5669get_pat(VALUE pat)
5670{
5671 VALUE val;
5672
5673 switch (OBJ_BUILTIN_TYPE(pat)) {
5674 case T_REGEXP:
5675 return pat;
5676
5677 case T_STRING:
5678 break;
5679
5680 default:
5681 val = rb_check_string_type(pat);
5682 if (NIL_P(val)) {
5683 Check_Type(pat, T_REGEXP);
5684 }
5685 pat = val;
5686 }
5687
5688 return rb_reg_regcomp(pat);
5689}
5690
5691static VALUE
5692get_pat_quoted(VALUE pat, int check)
5693{
5694 VALUE val;
5695
5696 switch (OBJ_BUILTIN_TYPE(pat)) {
5697 case T_REGEXP:
5698 return pat;
5699
5700 case T_STRING:
5701 break;
5702
5703 default:
5704 val = rb_check_string_type(pat);
5705 if (NIL_P(val)) {
5706 Check_Type(pat, T_REGEXP);
5707 }
5708 pat = val;
5709 }
5710 if (check && is_broken_string(pat)) {
5711 rb_exc_raise(rb_reg_check_preprocess(pat));
5712 }
5713 return pat;
5714}
5715
5716static long
5717rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5718{
5719 if (BUILTIN_TYPE(pat) == T_STRING) {
5720 pos = rb_str_byteindex(str, pat, pos);
5721 if (set_backref_str) {
5722 if (pos >= 0) {
5723 str = rb_str_new_frozen_String(str);
5724 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5725 }
5726 else {
5728 }
5729 }
5730 return pos;
5731 }
5732 else {
5733 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5734 }
5735}
5736
5737
5738/*
5739 * call-seq:
5740 * sub!(pattern, replacement) -> self or nil
5741 * sub!(pattern) {|match| ... } -> self or nil
5742 *
5743 * Returns +self+ with only the first occurrence
5744 * (not all occurrences) of the given +pattern+ replaced.
5745 *
5746 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5747 *
5748 * Related: String#sub, String#gsub, String#gsub!.
5749 *
5750 */
5751
5752static VALUE
5753rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5754{
5755 VALUE pat, repl, hash = Qnil;
5756 int iter = 0;
5757 long plen;
5758 int min_arity = rb_block_given_p() ? 1 : 2;
5759 long beg;
5760
5761 rb_check_arity(argc, min_arity, 2);
5762 if (argc == 1) {
5763 iter = 1;
5764 }
5765 else {
5766 repl = argv[1];
5767 hash = rb_check_hash_type(argv[1]);
5768 if (NIL_P(hash)) {
5769 StringValue(repl);
5770 }
5771 }
5772
5773 pat = get_pat_quoted(argv[0], 1);
5774
5775 str_modifiable(str);
5776 beg = rb_pat_search(pat, str, 0, 1);
5777 if (beg >= 0) {
5778 rb_encoding *enc;
5779 int cr = ENC_CODERANGE(str);
5780 long beg0, end0;
5781 VALUE match, match0 = Qnil;
5782 struct re_registers *regs;
5783 char *p, *rp;
5784 long len, rlen;
5785
5786 match = rb_backref_get();
5787 regs = RMATCH_REGS(match);
5788 if (RB_TYPE_P(pat, T_STRING)) {
5789 beg0 = beg;
5790 end0 = beg0 + RSTRING_LEN(pat);
5791 match0 = pat;
5792 }
5793 else {
5794 beg0 = BEG(0);
5795 end0 = END(0);
5796 if (iter) match0 = rb_reg_nth_match(0, match);
5797 }
5798
5799 if (iter || !NIL_P(hash)) {
5800 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5801
5802 if (iter) {
5803 repl = rb_obj_as_string(rb_yield(match0));
5804 }
5805 else {
5806 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5807 repl = rb_obj_as_string(repl);
5808 }
5809 str_mod_check(str, p, len);
5810 rb_check_frozen(str);
5811 }
5812 else {
5813 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5814 }
5815
5816 enc = rb_enc_compatible(str, repl);
5817 if (!enc) {
5818 rb_encoding *str_enc = STR_ENC_GET(str);
5819 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5820 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5821 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5822 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5823 rb_enc_name(str_enc),
5824 rb_enc_name(STR_ENC_GET(repl)));
5825 }
5826 enc = STR_ENC_GET(repl);
5827 }
5828 rb_str_modify(str);
5829 rb_enc_associate(str, enc);
5831 int cr2 = ENC_CODERANGE(repl);
5832 if (cr2 == ENC_CODERANGE_BROKEN ||
5833 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5835 else
5836 cr = cr2;
5837 }
5838 plen = end0 - beg0;
5839 rlen = RSTRING_LEN(repl);
5840 len = RSTRING_LEN(str);
5841 if (rlen > plen) {
5842 RESIZE_CAPA(str, len + rlen - plen);
5843 }
5844 p = RSTRING_PTR(str);
5845 if (rlen != plen) {
5846 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5847 }
5848 rp = RSTRING_PTR(repl);
5849 memmove(p + beg0, rp, rlen);
5850 len += rlen - plen;
5851 STR_SET_LEN(str, len);
5852 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5853 ENC_CODERANGE_SET(str, cr);
5854
5855 RB_GC_GUARD(match);
5856
5857 return str;
5858 }
5859 return Qnil;
5860}
5861
5862
5863/*
5864 * call-seq:
5865 * sub(pattern, replacement) -> new_string
5866 * sub(pattern) {|match| ... } -> new_string
5867 *
5868 * Returns a copy of +self+ with only the first occurrence
5869 * (not all occurrences) of the given +pattern+ replaced.
5870 *
5871 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5872 *
5873 * Related: String#sub!, String#gsub, String#gsub!.
5874 *
5875 */
5876
5877static VALUE
5878rb_str_sub(int argc, VALUE *argv, VALUE str)
5879{
5880 str = str_duplicate(rb_cString, str);
5881 rb_str_sub_bang(argc, argv, str);
5882 return str;
5883}
5884
5885static VALUE
5886str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5887{
5888 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
5889 long beg, beg0, end0;
5890 long offset, blen, slen, len, last;
5891 enum {STR, ITER, MAP} mode = STR;
5892 char *sp, *cp;
5893 int need_backref = -1;
5894 rb_encoding *str_enc;
5895
5896 switch (argc) {
5897 case 1:
5898 RETURN_ENUMERATOR(str, argc, argv);
5899 mode = ITER;
5900 break;
5901 case 2:
5902 repl = argv[1];
5903 hash = rb_check_hash_type(argv[1]);
5904 if (NIL_P(hash)) {
5905 StringValue(repl);
5906 }
5907 else {
5908 mode = MAP;
5909 }
5910 break;
5911 default:
5912 rb_error_arity(argc, 1, 2);
5913 }
5914
5915 pat = get_pat_quoted(argv[0], 1);
5916 beg = rb_pat_search(pat, str, 0, need_backref);
5917 if (beg < 0) {
5918 if (bang) return Qnil; /* no match, no substitution */
5919 return str_duplicate(rb_cString, str);
5920 }
5921
5922 offset = 0;
5923 blen = RSTRING_LEN(str) + 30; /* len + margin */
5924 dest = rb_str_buf_new(blen);
5925 sp = RSTRING_PTR(str);
5926 slen = RSTRING_LEN(str);
5927 cp = sp;
5928 str_enc = STR_ENC_GET(str);
5929 rb_enc_associate(dest, str_enc);
5930 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5931
5932 do {
5933 VALUE match = rb_backref_get();
5934 struct re_registers *regs = RMATCH_REGS(match);
5935 if (RB_TYPE_P(pat, T_STRING)) {
5936 beg0 = beg;
5937 end0 = beg0 + RSTRING_LEN(pat);
5938 match0 = pat;
5939 }
5940 else {
5941 beg0 = BEG(0);
5942 end0 = END(0);
5943 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5944 }
5945
5946 if (mode) {
5947 if (mode == ITER) {
5948 val = rb_obj_as_string(rb_yield(match0));
5949 }
5950 else {
5951 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5952 val = rb_obj_as_string(val);
5953 }
5954 str_mod_check(str, sp, slen);
5955 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5956 rb_raise(rb_eRuntimeError, "block should not cheat");
5957 }
5958 }
5959 else if (need_backref) {
5960 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5961 if (need_backref < 0) {
5962 need_backref = val != repl;
5963 }
5964 }
5965 else {
5966 val = repl;
5967 }
5968
5969 len = beg0 - offset; /* copy pre-match substr */
5970 if (len) {
5971 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5972 }
5973
5974 rb_str_buf_append(dest, val);
5975
5976 last = offset;
5977 offset = end0;
5978 if (beg0 == end0) {
5979 /*
5980 * Always consume at least one character of the input string
5981 * in order to prevent infinite loops.
5982 */
5983 if (RSTRING_LEN(str) <= end0) break;
5984 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5985 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5986 offset = end0 + len;
5987 }
5988 cp = RSTRING_PTR(str) + offset;
5989 if (offset > RSTRING_LEN(str)) break;
5990 beg = rb_pat_search(pat, str, offset, need_backref);
5991
5992 RB_GC_GUARD(match);
5993 } while (beg >= 0);
5994 if (RSTRING_LEN(str) > offset) {
5995 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5996 }
5997 rb_pat_search(pat, str, last, 1);
5998 if (bang) {
5999 str_shared_replace(str, dest);
6000 }
6001 else {
6002 str = dest;
6003 }
6004
6005 return str;
6006}
6007
6008
6009/*
6010 * call-seq:
6011 * gsub!(pattern, replacement) -> self or nil
6012 * gsub!(pattern) {|match| ... } -> self or nil
6013 * gsub!(pattern) -> an_enumerator
6014 *
6015 * Performs the specified substring replacement(s) on +self+;
6016 * returns +self+ if any replacement occurred, +nil+ otherwise.
6017 *
6018 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6019 *
6020 * Returns an Enumerator if no +replacement+ and no block given.
6021 *
6022 * Related: String#sub, String#gsub, String#sub!.
6023 *
6024 */
6025
6026static VALUE
6027rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
6028{
6029 str_modify_keep_cr(str);
6030 return str_gsub(argc, argv, str, 1);
6031}
6032
6033
6034/*
6035 * call-seq:
6036 * gsub(pattern, replacement) -> new_string
6037 * gsub(pattern) {|match| ... } -> new_string
6038 * gsub(pattern) -> enumerator
6039 *
6040 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6041 *
6042 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6043 *
6044 * Returns an Enumerator if no +replacement+ and no block given.
6045 *
6046 * Related: String#sub, String#sub!, String#gsub!.
6047 *
6048 */
6049
6050static VALUE
6051rb_str_gsub(int argc, VALUE *argv, VALUE str)
6052{
6053 return str_gsub(argc, argv, str, 0);
6054}
6055
6056
6057/*
6058 * call-seq:
6059 * replace(other_string) -> self
6060 *
6061 * Replaces the contents of +self+ with the contents of +other_string+:
6062 *
6063 * s = 'foo' # => "foo"
6064 * s.replace('bar') # => "bar"
6065 *
6066 */
6067
6068VALUE
6070{
6071 str_modifiable(str);
6072 if (str == str2) return str;
6073
6074 StringValue(str2);
6075 str_discard(str);
6076 return str_replace(str, str2);
6077}
6078
6079/*
6080 * call-seq:
6081 * clear -> self
6082 *
6083 * Removes the contents of +self+:
6084 *
6085 * s = 'foo' # => "foo"
6086 * s.clear # => ""
6087 *
6088 */
6089
6090static VALUE
6091rb_str_clear(VALUE str)
6092{
6093 str_discard(str);
6094 STR_SET_EMBED(str);
6095 STR_SET_LEN(str, 0);
6096 RSTRING_PTR(str)[0] = 0;
6097 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6099 else
6101 return str;
6102}
6103
6104/*
6105 * call-seq:
6106 * chr -> string
6107 *
6108 * Returns a string containing the first character of +self+:
6109 *
6110 * s = 'foo' # => "foo"
6111 * s.chr # => "f"
6112 *
6113 */
6114
6115static VALUE
6116rb_str_chr(VALUE str)
6117{
6118 return rb_str_substr(str, 0, 1);
6119}
6120
6121/*
6122 * call-seq:
6123 * getbyte(index) -> integer or nil
6124 *
6125 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6126 *
6127 * s = 'abcde' # => "abcde"
6128 * s.getbyte(0) # => 97
6129 * s.getbyte(-1) # => 101
6130 * s.getbyte(5) # => nil
6131 *
6132 * Related: String#setbyte.
6133 */
6134VALUE
6135rb_str_getbyte(VALUE str, VALUE index)
6136{
6137 long pos = NUM2LONG(index);
6138
6139 if (pos < 0)
6140 pos += RSTRING_LEN(str);
6141 if (pos < 0 || RSTRING_LEN(str) <= pos)
6142 return Qnil;
6143
6144 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6145}
6146
6147/*
6148 * call-seq:
6149 * setbyte(index, integer) -> integer
6150 *
6151 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6152 *
6153 * s = 'abcde' # => "abcde"
6154 * s.setbyte(0, 98) # => 98
6155 * s # => "bbcde"
6156 *
6157 * Related: String#getbyte.
6158 */
6159static VALUE
6160rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6161{
6162 long pos = NUM2LONG(index);
6163 long len = RSTRING_LEN(str);
6164 char *ptr, *head, *left = 0;
6165 rb_encoding *enc;
6166 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6167
6168 if (pos < -len || len <= pos)
6169 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6170 if (pos < 0)
6171 pos += len;
6172
6173 VALUE v = rb_to_int(value);
6174 VALUE w = rb_int_and(v, INT2FIX(0xff));
6175 char byte = (char)(NUM2INT(w) & 0xFF);
6176
6177 if (!str_independent(str))
6178 str_make_independent(str);
6179 enc = STR_ENC_GET(str);
6180 head = RSTRING_PTR(str);
6181 ptr = &head[pos];
6182 if (!STR_EMBED_P(str)) {
6183 cr = ENC_CODERANGE(str);
6184 switch (cr) {
6185 case ENC_CODERANGE_7BIT:
6186 left = ptr;
6187 *ptr = byte;
6188 if (ISASCII(byte)) goto end;
6189 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6190 if (!MBCLEN_CHARFOUND_P(nlen))
6192 else
6194 goto end;
6196 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6197 width = rb_enc_precise_mbclen(left, head+len, enc);
6198 *ptr = byte;
6199 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6200 if (!MBCLEN_CHARFOUND_P(nlen))
6202 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6204 goto end;
6205 }
6206 }
6208 *ptr = byte;
6209
6210 end:
6211 return value;
6212}
6213
6214static VALUE
6215str_byte_substr(VALUE str, long beg, long len, int empty)
6216{
6217 long n = RSTRING_LEN(str);
6218
6219 if (beg > n || len < 0) return Qnil;
6220 if (beg < 0) {
6221 beg += n;
6222 if (beg < 0) return Qnil;
6223 }
6224 if (len > n - beg)
6225 len = n - beg;
6226 if (len <= 0) {
6227 if (!empty) return Qnil;
6228 len = 0;
6229 }
6230
6231 VALUE str2 = str_subseq(str, beg, len);
6232
6233 str_enc_copy_direct(str2, str);
6234
6235 if (RSTRING_LEN(str2) == 0) {
6236 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6238 else
6240 }
6241 else {
6242 switch (ENC_CODERANGE(str)) {
6243 case ENC_CODERANGE_7BIT:
6245 break;
6246 default:
6248 break;
6249 }
6250 }
6251
6252 return str2;
6253}
6254
6255static VALUE
6256str_byte_aref(VALUE str, VALUE indx)
6257{
6258 long idx;
6259 if (FIXNUM_P(indx)) {
6260 idx = FIX2LONG(indx);
6261 }
6262 else {
6263 /* check if indx is Range */
6264 long beg, len = RSTRING_LEN(str);
6265
6266 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6267 case Qfalse:
6268 break;
6269 case Qnil:
6270 return Qnil;
6271 default:
6272 return str_byte_substr(str, beg, len, TRUE);
6273 }
6274
6275 idx = NUM2LONG(indx);
6276 }
6277 return str_byte_substr(str, idx, 1, FALSE);
6278}
6279
6280/*
6281 * call-seq:
6282 * byteslice(index, length = 1) -> string or nil
6283 * byteslice(range) -> string or nil
6284 *
6285 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6286 *
6287 * With integer arguments +index+ and +length+ given,
6288 * returns the substring beginning at the given +index+
6289 * of the given +length+ (if possible),
6290 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6291 *
6292 * s = '0123456789' # => "0123456789"
6293 * s.byteslice(2) # => "2"
6294 * s.byteslice(200) # => nil
6295 * s.byteslice(4, 3) # => "456"
6296 * s.byteslice(4, 30) # => "456789"
6297 * s.byteslice(4, -1) # => nil
6298 * s.byteslice(40, 2) # => nil
6299 *
6300 * In either case above, counts backwards from the end of +self+
6301 * if +index+ is negative:
6302 *
6303 * s = '0123456789' # => "0123456789"
6304 * s.byteslice(-4) # => "6"
6305 * s.byteslice(-4, 3) # => "678"
6306 *
6307 * With Range argument +range+ given, returns
6308 * <tt>byteslice(range.begin, range.size)</tt>:
6309 *
6310 * s = '0123456789' # => "0123456789"
6311 * s.byteslice(4..6) # => "456"
6312 * s.byteslice(-6..-4) # => "456"
6313 * s.byteslice(5..2) # => "" # range.size is zero.
6314 * s.byteslice(40..42) # => nil
6315 *
6316 * In all cases, a returned string has the same encoding as +self+:
6317 *
6318 * s.encoding # => #<Encoding:UTF-8>
6319 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6320 *
6321 */
6322
6323static VALUE
6324rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6325{
6326 if (argc == 2) {
6327 long beg = NUM2LONG(argv[0]);
6328 long len = NUM2LONG(argv[1]);
6329 return str_byte_substr(str, beg, len, TRUE);
6330 }
6331 rb_check_arity(argc, 1, 2);
6332 return str_byte_aref(str, argv[0]);
6333}
6334
6335static void
6336str_check_beg_len(VALUE str, long *beg, long *len)
6337{
6338 long end, slen = RSTRING_LEN(str);
6339
6340 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6341 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6342 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6343 }
6344 if (*beg < 0) {
6345 *beg += slen;
6346 }
6347 assert(*beg >= 0);
6348 assert(*beg <= slen);
6349 if (*len > slen - *beg) {
6350 *len = slen - *beg;
6351 }
6352 end = *beg + *len;
6353 str_ensure_byte_pos(str, *beg);
6354 str_ensure_byte_pos(str, end);
6355}
6356
6357/*
6358 * call-seq:
6359 * bytesplice(index, length, str) -> string
6360 * bytesplice(index, length, str, str_index, str_length) -> string
6361 * bytesplice(range, str) -> string
6362 * bytesplice(range, str, str_range) -> string
6363 *
6364 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6365 * The portion of the string affected is determined using
6366 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6367 * If the replacement string is not the same length as the text it is replacing,
6368 * the string will be adjusted accordingly.
6369 *
6370 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6371 *
6372 * The forms that take an Integer will raise an IndexError if the value is out
6373 * of range; the Range form will raise a RangeError.
6374 * If the beginning or ending offset does not land on character (codepoint)
6375 * boundary, an IndexError will be raised.
6376 */
6377
6378static VALUE
6379rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6380{
6381 long beg, len, vbeg, vlen;
6382 VALUE val;
6383 rb_encoding *enc;
6384 int cr;
6385
6386 rb_check_arity(argc, 2, 5);
6387 if (!(argc == 2 || argc == 3 || argc == 5)) {
6388 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6389 }
6390 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6391 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6392 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6393 rb_builtin_class_name(argv[0]));
6394 }
6395 val = argv[1];
6396 StringValue(val);
6397 if (argc == 2) {
6398 /* bytesplice(range, str) */
6399 vbeg = 0;
6400 vlen = RSTRING_LEN(val);
6401 }
6402 else {
6403 /* bytesplice(range, str, str_range) */
6404 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6405 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6406 rb_builtin_class_name(argv[2]));
6407 }
6408 }
6409 }
6410 else {
6411 beg = NUM2LONG(argv[0]);
6412 len = NUM2LONG(argv[1]);
6413 val = argv[2];
6414 StringValue(val);
6415 if (argc == 3) {
6416 /* bytesplice(index, length, str) */
6417 vbeg = 0;
6418 vlen = RSTRING_LEN(val);
6419 }
6420 else {
6421 /* bytesplice(index, length, str, str_index, str_length) */
6422 vbeg = NUM2LONG(argv[3]);
6423 vlen = NUM2LONG(argv[4]);
6424 }
6425 }
6426 str_check_beg_len(str, &beg, &len);
6427 str_check_beg_len(val, &vbeg, &vlen);
6428 enc = rb_enc_check(str, val);
6429 str_modify_keep_cr(str);
6430 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6431 rb_enc_associate(str, enc);
6433 if (cr != ENC_CODERANGE_BROKEN)
6434 ENC_CODERANGE_SET(str, cr);
6435 return str;
6436}
6437
6438/*
6439 * call-seq:
6440 * reverse -> string
6441 *
6442 * Returns a new string with the characters from +self+ in reverse order.
6443 *
6444 * 'stressed'.reverse # => "desserts"
6445 *
6446 */
6447
6448static VALUE
6449rb_str_reverse(VALUE str)
6450{
6451 rb_encoding *enc;
6452 VALUE rev;
6453 char *s, *e, *p;
6454 int cr;
6455
6456 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6457 enc = STR_ENC_GET(str);
6458 rev = rb_str_new(0, RSTRING_LEN(str));
6459 s = RSTRING_PTR(str); e = RSTRING_END(str);
6460 p = RSTRING_END(rev);
6461 cr = ENC_CODERANGE(str);
6462
6463 if (RSTRING_LEN(str) > 1) {
6464 if (single_byte_optimizable(str)) {
6465 while (s < e) {
6466 *--p = *s++;
6467 }
6468 }
6469 else if (cr == ENC_CODERANGE_VALID) {
6470 while (s < e) {
6471 int clen = rb_enc_fast_mbclen(s, e, enc);
6472
6473 p -= clen;
6474 memcpy(p, s, clen);
6475 s += clen;
6476 }
6477 }
6478 else {
6479 cr = rb_enc_asciicompat(enc) ?
6481 while (s < e) {
6482 int clen = rb_enc_mbclen(s, e, enc);
6483
6484 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6485 p -= clen;
6486 memcpy(p, s, clen);
6487 s += clen;
6488 }
6489 }
6490 }
6491 STR_SET_LEN(rev, RSTRING_LEN(str));
6492 str_enc_copy_direct(rev, str);
6493 ENC_CODERANGE_SET(rev, cr);
6494
6495 return rev;
6496}
6497
6498
6499/*
6500 * call-seq:
6501 * reverse! -> self
6502 *
6503 * Returns +self+ with its characters reversed:
6504 *
6505 * s = 'stressed'
6506 * s.reverse! # => "desserts"
6507 * s # => "desserts"
6508 *
6509 */
6510
6511static VALUE
6512rb_str_reverse_bang(VALUE str)
6513{
6514 if (RSTRING_LEN(str) > 1) {
6515 if (single_byte_optimizable(str)) {
6516 char *s, *e, c;
6517
6518 str_modify_keep_cr(str);
6519 s = RSTRING_PTR(str);
6520 e = RSTRING_END(str) - 1;
6521 while (s < e) {
6522 c = *s;
6523 *s++ = *e;
6524 *e-- = c;
6525 }
6526 }
6527 else {
6528 str_shared_replace(str, rb_str_reverse(str));
6529 }
6530 }
6531 else {
6532 str_modify_keep_cr(str);
6533 }
6534 return str;
6535}
6536
6537
6538/*
6539 * call-seq:
6540 * include? other_string -> true or false
6541 *
6542 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6543 *
6544 * s = 'foo'
6545 * s.include?('f') # => true
6546 * s.include?('fo') # => true
6547 * s.include?('food') # => false
6548 *
6549 */
6550
6551VALUE
6552rb_str_include(VALUE str, VALUE arg)
6553{
6554 long i;
6555
6556 StringValue(arg);
6557 i = rb_str_index(str, arg, 0);
6558
6559 return RBOOL(i != -1);
6560}
6561
6562
6563/*
6564 * call-seq:
6565 * to_i(base = 10) -> integer
6566 *
6567 * Returns the result of interpreting leading characters in +self+
6568 * as an integer in the given +base+ (which must be in (0, 2..36)):
6569 *
6570 * '123456'.to_i # => 123456
6571 * '123def'.to_i(16) # => 1195503
6572 *
6573 * With +base+ zero, string +object+ may contain leading characters
6574 * to specify the actual base:
6575 *
6576 * '123def'.to_i(0) # => 123
6577 * '0123def'.to_i(0) # => 83
6578 * '0b123def'.to_i(0) # => 1
6579 * '0o123def'.to_i(0) # => 83
6580 * '0d123def'.to_i(0) # => 123
6581 * '0x123def'.to_i(0) # => 1195503
6582 *
6583 * Characters past a leading valid number (in the given +base+) are ignored:
6584 *
6585 * '12.345'.to_i # => 12
6586 * '12345'.to_i(2) # => 1
6587 *
6588 * Returns zero if there is no leading valid number:
6589 *
6590 * 'abcdef'.to_i # => 0
6591 * '2'.to_i(2) # => 0
6592 *
6593 */
6594
6595static VALUE
6596rb_str_to_i(int argc, VALUE *argv, VALUE str)
6597{
6598 int base = 10;
6599
6600 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6601 rb_raise(rb_eArgError, "invalid radix %d", base);
6602 }
6603 return rb_str_to_inum(str, base, FALSE);
6604}
6605
6606
6607/*
6608 * call-seq:
6609 * to_f -> float
6610 *
6611 * Returns the result of interpreting leading characters in +self+ as a Float:
6612 *
6613 * '3.14159'.to_f # => 3.14159
6614 * '1.234e-2'.to_f # => 0.01234
6615 *
6616 * Characters past a leading valid number are ignored:
6617 *
6618 * '3.14 (pi to two places)'.to_f # => 3.14
6619 *
6620 * Returns zero if there is no leading valid number:
6621 *
6622 * 'abcdef'.to_f # => 0.0
6623 *
6624 */
6625
6626static VALUE
6627rb_str_to_f(VALUE str)
6628{
6629 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6630}
6631
6632
6633/*
6634 * call-seq:
6635 * to_s -> self or string
6636 *
6637 * Returns +self+ if +self+ is a \String,
6638 * or +self+ converted to a \String if +self+ is a subclass of \String.
6639 */
6640
6641static VALUE
6642rb_str_to_s(VALUE str)
6643{
6644 if (rb_obj_class(str) != rb_cString) {
6645 return str_duplicate(rb_cString, str);
6646 }
6647 return str;
6648}
6649
#if 0
/* Disabled helper: encodes codepoint +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6661
6662#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6663
6664int
6665rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6666{
6667 char buf[CHAR_ESC_LEN + 1];
6668 int l;
6669
6670#if SIZEOF_INT > 4
6671 c &= 0xffffffff;
6672#endif
6673 if (unicode_p) {
6674 if (c < 0x7F && ISPRINT(c)) {
6675 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6676 }
6677 else if (c < 0x10000) {
6678 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6679 }
6680 else {
6681 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6682 }
6683 }
6684 else {
6685 if (c < 0x100) {
6686 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6687 }
6688 else {
6689 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6690 }
6691 }
6692 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6693 rb_str_buf_cat(result, buf, l);
6694 return l;
6695}
6696
/*
 * Returns the backslash escape used to display control character +c+
 * (for example "\\n" for a newline), or NULL when +c+ has no such
 * dedicated escape.
 */
const char *
ruby_escaped_char(int c)
{
    switch (c) {
      case '\0':   return "\\0";
      case '\n':   return "\\n";
      case '\r':   return "\\r";
      case '\t':   return "\\t";
      case '\f':   return "\\f";
      case '\013': return "\\v";
      case '\010': return "\\b";
      case '\007': return "\\a";
      case '\033': return "\\e";
      case '\x7f': return "\\c?";
      default:     return NULL;
    }
}
6714
6715VALUE
6716rb_str_escape(VALUE str)
6717{
6718 int encidx = ENCODING_GET(str);
6719 rb_encoding *enc = rb_enc_from_index(encidx);
6720 const char *p = RSTRING_PTR(str);
6721 const char *pend = RSTRING_END(str);
6722 const char *prev = p;
6723 char buf[CHAR_ESC_LEN + 1];
6724 VALUE result = rb_str_buf_new(0);
6725 int unicode_p = rb_enc_unicode_p(enc);
6726 int asciicompat = rb_enc_asciicompat(enc);
6727
6728 while (p < pend) {
6729 unsigned int c;
6730 const char *cc;
6731 int n = rb_enc_precise_mbclen(p, pend, enc);
6732 if (!MBCLEN_CHARFOUND_P(n)) {
6733 if (p > prev) str_buf_cat(result, prev, p - prev);
6734 n = rb_enc_mbminlen(enc);
6735 if (pend < p + n)
6736 n = (int)(pend - p);
6737 while (n--) {
6738 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6739 str_buf_cat(result, buf, strlen(buf));
6740 prev = ++p;
6741 }
6742 continue;
6743 }
6744 n = MBCLEN_CHARFOUND_LEN(n);
6745 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6746 p += n;
6747 cc = ruby_escaped_char(c);
6748 if (cc) {
6749 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6750 str_buf_cat(result, cc, strlen(cc));
6751 prev = p;
6752 }
6753 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6754 }
6755 else {
6756 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6757 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6758 prev = p;
6759 }
6760 }
6761 if (p > prev) str_buf_cat(result, prev, p - prev);
6762 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6763
6764 return result;
6765}
6766
6767/*
6768 * call-seq:
6769 * inspect -> string
6770 *
6771 * Returns a printable version of +self+, enclosed in double-quotes,
6772 * and with special characters escaped:
6773 *
6774 * s = "foo\tbar\tbaz\n"
6775 * s.inspect
6776 * # => "\"foo\\tbar\\tbaz\\n\""
6777 *
6778 */
6779
6780VALUE
6782{
6783 int encidx = ENCODING_GET(str);
6784 rb_encoding *enc = rb_enc_from_index(encidx);
6785 const char *p, *pend, *prev;
6786 char buf[CHAR_ESC_LEN + 1];
6787 VALUE result = rb_str_buf_new(0);
6788 rb_encoding *resenc = rb_default_internal_encoding();
6789 int unicode_p = rb_enc_unicode_p(enc);
6790 int asciicompat = rb_enc_asciicompat(enc);
6791
6792 if (resenc == NULL) resenc = rb_default_external_encoding();
6793 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6794 rb_enc_associate(result, resenc);
6795 str_buf_cat2(result, "\"");
6796
6797 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6798 prev = p;
6799 while (p < pend) {
6800 unsigned int c, cc;
6801 int n;
6802
6803 n = rb_enc_precise_mbclen(p, pend, enc);
6804 if (!MBCLEN_CHARFOUND_P(n)) {
6805 if (p > prev) str_buf_cat(result, prev, p - prev);
6806 n = rb_enc_mbminlen(enc);
6807 if (pend < p + n)
6808 n = (int)(pend - p);
6809 while (n--) {
6810 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6811 str_buf_cat(result, buf, strlen(buf));
6812 prev = ++p;
6813 }
6814 continue;
6815 }
6816 n = MBCLEN_CHARFOUND_LEN(n);
6817 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6818 p += n;
6819 if ((asciicompat || unicode_p) &&
6820 (c == '"'|| c == '\\' ||
6821 (c == '#' &&
6822 p < pend &&
6823 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6824 (cc = rb_enc_codepoint(p,pend,enc),
6825 (cc == '$' || cc == '@' || cc == '{'))))) {
6826 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6827 str_buf_cat2(result, "\\");
6828 if (asciicompat || enc == resenc) {
6829 prev = p - n;
6830 continue;
6831 }
6832 }
6833 switch (c) {
6834 case '\n': cc = 'n'; break;
6835 case '\r': cc = 'r'; break;
6836 case '\t': cc = 't'; break;
6837 case '\f': cc = 'f'; break;
6838 case '\013': cc = 'v'; break;
6839 case '\010': cc = 'b'; break;
6840 case '\007': cc = 'a'; break;
6841 case 033: cc = 'e'; break;
6842 default: cc = 0; break;
6843 }
6844 if (cc) {
6845 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6846 buf[0] = '\\';
6847 buf[1] = (char)cc;
6848 str_buf_cat(result, buf, 2);
6849 prev = p;
6850 continue;
6851 }
6852 /* The special casing of 0x85 (NEXT_LINE) here is because
6853 * Oniguruma historically treats it as printable, but it
6854 * doesn't match the print POSIX bracket class or character
6855 * property in regexps.
6856 *
6857 * See Ruby Bug #16842 for details:
6858 * https://bugs.ruby-lang.org/issues/16842
6859 */
6860 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6861 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6862 continue;
6863 }
6864 else {
6865 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6866 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6867 prev = p;
6868 continue;
6869 }
6870 }
6871 if (p > prev) str_buf_cat(result, prev, p - prev);
6872 str_buf_cat2(result, "\"");
6873
6874 return result;
6875}
6876
6877#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6878
6879/*
6880 * call-seq:
6881 * dump -> string
6882 *
6883 * Returns a printable version of +self+, enclosed in double-quotes,
6884 * with special characters escaped, and with non-printing characters
6885 * replaced by hexadecimal notation:
6886 *
6887 * "hello \n ''".dump # => "\"hello \\n ''\""
6888 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6889 *
6890 * Related: String#undump (inverse of String#dump).
6891 *
6892 */
6893
6894VALUE
6896{
6897 int encidx = rb_enc_get_index(str);
6898 rb_encoding *enc = rb_enc_from_index(encidx);
6899 long len;
6900 const char *p, *pend;
6901 char *q, *qend;
6902 VALUE result;
6903 int u8 = (encidx == rb_utf8_encindex());
6904 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6905
6906 len = 2; /* "" */
6907 if (!rb_enc_asciicompat(enc)) {
6908 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6909 len += strlen(enc->name);
6910 }
6911
6912 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6913 while (p < pend) {
6914 int clen;
6915 unsigned char c = *p++;
6916
6917 switch (c) {
6918 case '"': case '\\':
6919 case '\n': case '\r':
6920 case '\t': case '\f':
6921 case '\013': case '\010': case '\007': case '\033':
6922 clen = 2;
6923 break;
6924
6925 case '#':
6926 clen = IS_EVSTR(p, pend) ? 2 : 1;
6927 break;
6928
6929 default:
6930 if (ISPRINT(c)) {
6931 clen = 1;
6932 }
6933 else {
6934 if (u8 && c > 0x7F) { /* \u notation */
6935 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6936 if (MBCLEN_CHARFOUND_P(n)) {
6937 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6938 if (cc <= 0xFFFF)
6939 clen = 6; /* \uXXXX */
6940 else if (cc <= 0xFFFFF)
6941 clen = 9; /* \u{XXXXX} */
6942 else
6943 clen = 10; /* \u{XXXXXX} */
6944 p += MBCLEN_CHARFOUND_LEN(n)-1;
6945 break;
6946 }
6947 }
6948 clen = 4; /* \xNN */
6949 }
6950 break;
6951 }
6952
6953 if (clen > LONG_MAX - len) {
6954 rb_raise(rb_eRuntimeError, "string size too big");
6955 }
6956 len += clen;
6957 }
6958
6959 result = rb_str_new(0, len);
6960 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6961 q = RSTRING_PTR(result); qend = q + len + 1;
6962
6963 *q++ = '"';
6964 while (p < pend) {
6965 unsigned char c = *p++;
6966
6967 if (c == '"' || c == '\\') {
6968 *q++ = '\\';
6969 *q++ = c;
6970 }
6971 else if (c == '#') {
6972 if (IS_EVSTR(p, pend)) *q++ = '\\';
6973 *q++ = '#';
6974 }
6975 else if (c == '\n') {
6976 *q++ = '\\';
6977 *q++ = 'n';
6978 }
6979 else if (c == '\r') {
6980 *q++ = '\\';
6981 *q++ = 'r';
6982 }
6983 else if (c == '\t') {
6984 *q++ = '\\';
6985 *q++ = 't';
6986 }
6987 else if (c == '\f') {
6988 *q++ = '\\';
6989 *q++ = 'f';
6990 }
6991 else if (c == '\013') {
6992 *q++ = '\\';
6993 *q++ = 'v';
6994 }
6995 else if (c == '\010') {
6996 *q++ = '\\';
6997 *q++ = 'b';
6998 }
6999 else if (c == '\007') {
7000 *q++ = '\\';
7001 *q++ = 'a';
7002 }
7003 else if (c == '\033') {
7004 *q++ = '\\';
7005 *q++ = 'e';
7006 }
7007 else if (ISPRINT(c)) {
7008 *q++ = c;
7009 }
7010 else {
7011 *q++ = '\\';
7012 if (u8) {
7013 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
7014 if (MBCLEN_CHARFOUND_P(n)) {
7015 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
7016 p += n;
7017 if (cc <= 0xFFFF)
7018 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
7019 else
7020 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
7021 q += strlen(q);
7022 continue;
7023 }
7024 }
7025 snprintf(q, qend-q, "x%02X", c);
7026 q += 3;
7027 }
7028 }
7029 *q++ = '"';
7030 *q = '\0';
7031 if (!rb_enc_asciicompat(enc)) {
7032 snprintf(q, qend-q, nonascii_suffix, enc->name);
7033 encidx = rb_ascii8bit_encindex();
7034 }
7035 /* result from dump is ASCII */
7036 rb_enc_associate_index(result, encidx);
7038 return result;
7039}
7040
/*
 * Maps the letter following a backslash in a simple escape ('n', 'r',
 * 't', 'f', 'v', 'b', 'a', 'e') to the character it stands for.
 * Callers must pass one of these letters only.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    /* not reached for the letters listed above */
    UNREACHABLE_RETURN(-1);
}
7064
7065static void
7066undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7067{
7068 const char *s = *ss;
7069 unsigned int c;
7070 int codelen;
7071 size_t hexlen;
7072 unsigned char buf[6];
7073 static rb_encoding *enc_utf8 = NULL;
7074
7075 switch (*s) {
7076 case '\\':
7077 case '"':
7078 case '#':
7079 rb_str_cat(undumped, s, 1); /* cat itself */
7080 s++;
7081 break;
7082 case 'n':
7083 case 'r':
7084 case 't':
7085 case 'f':
7086 case 'v':
7087 case 'b':
7088 case 'a':
7089 case 'e':
7090 *buf = unescape_ascii(*s);
7091 rb_str_cat(undumped, (char *)buf, 1);
7092 s++;
7093 break;
7094 case 'u':
7095 if (*binary) {
7096 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7097 }
7098 *utf8 = true;
7099 if (++s >= s_end) {
7100 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7101 }
7102 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7103 if (*penc != enc_utf8) {
7104 *penc = enc_utf8;
7105 rb_enc_associate(undumped, enc_utf8);
7106 }
7107 if (*s == '{') { /* handle \u{...} form */
7108 s++;
7109 for (;;) {
7110 if (s >= s_end) {
7111 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7112 }
7113 if (*s == '}') {
7114 s++;
7115 break;
7116 }
7117 if (ISSPACE(*s)) {
7118 s++;
7119 continue;
7120 }
7121 c = scan_hex(s, s_end-s, &hexlen);
7122 if (hexlen == 0 || hexlen > 6) {
7123 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7124 }
7125 if (c > 0x10ffff) {
7126 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7127 }
7128 if (0xd800 <= c && c <= 0xdfff) {
7129 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7130 }
7131 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7132 rb_str_cat(undumped, (char *)buf, codelen);
7133 s += hexlen;
7134 }
7135 }
7136 else { /* handle \uXXXX form */
7137 c = scan_hex(s, 4, &hexlen);
7138 if (hexlen != 4) {
7139 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7140 }
7141 if (0xd800 <= c && c <= 0xdfff) {
7142 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7143 }
7144 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7145 rb_str_cat(undumped, (char *)buf, codelen);
7146 s += hexlen;
7147 }
7148 break;
7149 case 'x':
7150 if (*utf8) {
7151 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7152 }
7153 *binary = true;
7154 if (++s >= s_end) {
7155 rb_raise(rb_eRuntimeError, "invalid hex escape");
7156 }
7157 *buf = scan_hex(s, 2, &hexlen);
7158 if (hexlen != 2) {
7159 rb_raise(rb_eRuntimeError, "invalid hex escape");
7160 }
7161 rb_str_cat(undumped, (char *)buf, 1);
7162 s += hexlen;
7163 break;
7164 default:
7165 rb_str_cat(undumped, s-1, 2);
7166 s++;
7167 }
7168
7169 *ss = s;
7170}
7171
7172static VALUE rb_str_is_ascii_only_p(VALUE str);
7173
7174/*
7175 * call-seq:
7176 * undump -> string
7177 *
7178 * Returns an unescaped version of +self+:
7179 *
7180 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7181 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7182 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7183 * s_undumped == s_orig # => true
7184 *
7185 * Related: String#dump (inverse of String#undump).
7186 *
7187 */
7188
7189static VALUE
7190str_undump(VALUE str)
7191{
7192 const char *s = RSTRING_PTR(str);
7193 const char *s_end = RSTRING_END(str);
7194 rb_encoding *enc = rb_enc_get(str);
7195 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7196 bool utf8 = false;
7197 bool binary = false;
7198 int w;
7199
7201 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7202 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7203 }
7204 if (!str_null_check(str, &w)) {
7205 rb_raise(rb_eRuntimeError, "string contains null byte");
7206 }
7207 if (RSTRING_LEN(str) < 2) goto invalid_format;
7208 if (*s != '"') goto invalid_format;
7209
7210 /* strip '"' at the start */
7211 s++;
7212
7213 for (;;) {
7214 if (s >= s_end) {
7215 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7216 }
7217
7218 if (*s == '"') {
7219 /* epilogue */
7220 s++;
7221 if (s == s_end) {
7222 /* ascii compatible dumped string */
7223 break;
7224 }
7225 else {
7226 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7227 static const char dup_suffix[] = ".dup";
7228 const char *encname;
7229 int encidx;
7230 ptrdiff_t size;
7231
7232 /* check separately for strings dumped by older versions */
7233 size = sizeof(dup_suffix) - 1;
7234 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7235
7236 size = sizeof(force_encoding_suffix) - 1;
7237 if (s_end - s <= size) goto invalid_format;
7238 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7239 s += size;
7240
7241 if (utf8) {
7242 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7243 }
7244
7245 encname = s;
7246 s = memchr(s, '"', s_end-s);
7247 size = s - encname;
7248 if (!s) goto invalid_format;
7249 if (s_end - s != 2) goto invalid_format;
7250 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7251
7252 encidx = rb_enc_find_index2(encname, (long)size);
7253 if (encidx < 0) {
7254 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7255 }
7256 rb_enc_associate_index(undumped, encidx);
7257 }
7258 break;
7259 }
7260
7261 if (*s == '\\') {
7262 s++;
7263 if (s >= s_end) {
7264 rb_raise(rb_eRuntimeError, "invalid escape");
7265 }
7266 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7267 }
7268 else {
7269 rb_str_cat(undumped, s++, 1);
7270 }
7271 }
7272
7273 RB_GC_GUARD(str);
7274
7275 return undumped;
7276invalid_format:
7277 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7278}
7279
7280static void
7281rb_str_check_dummy_enc(rb_encoding *enc)
7282{
7283 if (rb_enc_dummy_p(enc)) {
7284 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7285 rb_enc_name(enc));
7286 }
7287}
7288
7289static rb_encoding *
7290str_true_enc(VALUE str)
7291{
7292 rb_encoding *enc = STR_ENC_GET(str);
7293 rb_str_check_dummy_enc(enc);
7294 return enc;
7295}
7296
7297static OnigCaseFoldType
7298check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7299{
7300 if (argc==0)
7301 return flags;
7302 if (argc>2)
7303 rb_raise(rb_eArgError, "too many options");
7304 if (argv[0]==sym_turkic) {
7305 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7306 if (argc==2) {
7307 if (argv[1]==sym_lithuanian)
7308 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7309 else
7310 rb_raise(rb_eArgError, "invalid second option");
7311 }
7312 }
7313 else if (argv[0]==sym_lithuanian) {
7314 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7315 if (argc==2) {
7316 if (argv[1]==sym_turkic)
7317 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7318 else
7319 rb_raise(rb_eArgError, "invalid second option");
7320 }
7321 }
7322 else if (argc>1)
7323 rb_raise(rb_eArgError, "too many options");
7324 else if (argv[0]==sym_ascii)
7325 flags |= ONIGENC_CASE_ASCII_ONLY;
7326 else if (argv[0]==sym_fold) {
7327 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7328 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7329 else
7330 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7331 }
7332 else
7333 rb_raise(rb_eArgError, "invalid option");
7334 return flags;
7335}
7336
7337static inline bool
7338case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7339{
7340 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7341 return true;
7342 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7343}
7344
7345/* 16 should be long enough to absorb any kind of single character length increase */
7346#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7347#ifndef CASEMAP_DEBUG
7348# define CASEMAP_DEBUG 0
7349#endif
7350
7351struct mapping_buffer;
7352typedef struct mapping_buffer {
7353 size_t capa;
7354 size_t used;
7355 struct mapping_buffer *next;
7356 OnigUChar space[FLEX_ARY_LEN];
7358
7359static void
7360mapping_buffer_free(void *p)
7361{
7362 mapping_buffer *previous_buffer;
7363 mapping_buffer *current_buffer = p;
7364 while (current_buffer) {
7365 previous_buffer = current_buffer;
7366 current_buffer = current_buffer->next;
7367 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7368 }
7369}
7370
7371static const rb_data_type_t mapping_buffer_type = {
7372 "mapping_buffer",
7373 {0, mapping_buffer_free,},
7374 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7375};
7376
7377static VALUE
7378rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7379{
7380 VALUE target;
7381
7382 const OnigUChar *source_current, *source_end;
7383 int target_length = 0;
7384 VALUE buffer_anchor;
7385 mapping_buffer *current_buffer = 0;
7386 mapping_buffer **pre_buffer;
7387 size_t buffer_count = 0;
7388 int buffer_length_or_invalid;
7389
7390 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7391
7392 source_current = (OnigUChar*)RSTRING_PTR(source);
7393 source_end = (OnigUChar*)RSTRING_END(source);
7394
7395 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7396 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7397 while (source_current < source_end) {
7398 /* increase multiplier using buffer count to converge quickly */
7399 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7400 if (CASEMAP_DEBUG) {
7401 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7402 }
7403 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7404 *pre_buffer = current_buffer;
7405 pre_buffer = &current_buffer->next;
7406 current_buffer->next = NULL;
7407 current_buffer->capa = capa;
7408 buffer_length_or_invalid = enc->case_map(flags,
7409 &source_current, source_end,
7410 current_buffer->space,
7411 current_buffer->space+current_buffer->capa,
7412 enc);
7413 if (buffer_length_or_invalid < 0) {
7414 current_buffer = DATA_PTR(buffer_anchor);
7415 DATA_PTR(buffer_anchor) = 0;
7416 mapping_buffer_free(current_buffer);
7417 rb_raise(rb_eArgError, "input string invalid");
7418 }
7419 target_length += current_buffer->used = buffer_length_or_invalid;
7420 }
7421 if (CASEMAP_DEBUG) {
7422 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7423 }
7424
7425 if (buffer_count==1) {
7426 target = rb_str_new((const char*)current_buffer->space, target_length);
7427 }
7428 else {
7429 char *target_current;
7430
7431 target = rb_str_new(0, target_length);
7432 target_current = RSTRING_PTR(target);
7433 current_buffer = DATA_PTR(buffer_anchor);
7434 while (current_buffer) {
7435 memcpy(target_current, current_buffer->space, current_buffer->used);
7436 target_current += current_buffer->used;
7437 current_buffer = current_buffer->next;
7438 }
7439 }
7440 current_buffer = DATA_PTR(buffer_anchor);
7441 DATA_PTR(buffer_anchor) = 0;
7442 mapping_buffer_free(current_buffer);
7443
7444 RB_GC_GUARD(buffer_anchor);
7445
7446 /* TODO: check about string terminator character */
7447 str_enc_copy_direct(target, source);
7448 /*ENC_CODERANGE_SET(mapped, cr);*/
7449
7450 return target;
7451}
7452
7453static VALUE
7454rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7455{
7456 const OnigUChar *source_current, *source_end;
7457 OnigUChar *target_current, *target_end;
7458 long old_length = RSTRING_LEN(source);
7459 int length_or_invalid;
7460
7461 if (old_length == 0) return Qnil;
7462
7463 source_current = (OnigUChar*)RSTRING_PTR(source);
7464 source_end = (OnigUChar*)RSTRING_END(source);
7465 if (source == target) {
7466 target_current = (OnigUChar*)source_current;
7467 target_end = (OnigUChar*)source_end;
7468 }
7469 else {
7470 target_current = (OnigUChar*)RSTRING_PTR(target);
7471 target_end = (OnigUChar*)RSTRING_END(target);
7472 }
7473
7474 length_or_invalid = onigenc_ascii_only_case_map(flags,
7475 &source_current, source_end,
7476 target_current, target_end, enc);
7477 if (length_or_invalid < 0)
7478 rb_raise(rb_eArgError, "input string invalid");
7479 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7480 fprintf(stderr, "problem with rb_str_ascii_casemap"
7481 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7482 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7483 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7484 }
7485
7486 str_enc_copy(target, source);
7487
7488 return target;
7489}
7490
7491static bool
7492upcase_single(VALUE str)
7493{
7494 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7495 bool modified = false;
7496
7497 while (s < send) {
7498 unsigned int c = *(unsigned char*)s;
7499
7500 if ('a' <= c && c <= 'z') {
7501 *s = 'A' + (c - 'a');
7502 modified = true;
7503 }
7504 s++;
7505 }
7506 return modified;
7507}
7508
7509/*
7510 * call-seq:
7511 * upcase!(*options) -> self or nil
7512 *
7513 * Upcases the characters in +self+;
7514 * returns +self+ if any changes were made, +nil+ otherwise:
7515 *
7516 * s = 'Hello World!' # => "Hello World!"
7517 * s.upcase! # => "HELLO WORLD!"
7518 * s # => "HELLO WORLD!"
7519 * s.upcase! # => nil
7520 *
7521 * The casing may be affected by the given +options+;
7522 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7523 *
7524 * Related: String#upcase, String#downcase, String#downcase!.
7525 *
7526 */
7527
7528static VALUE
7529rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7530{
7531 rb_encoding *enc;
7532 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7533
7534 flags = check_case_options(argc, argv, flags);
7535 str_modify_keep_cr(str);
7536 enc = str_true_enc(str);
7537 if (case_option_single_p(flags, enc, str)) {
7538 if (upcase_single(str))
7539 flags |= ONIGENC_CASE_MODIFIED;
7540 }
7541 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7542 rb_str_ascii_casemap(str, str, &flags, enc);
7543 else
7544 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7545
7546 if (ONIGENC_CASE_MODIFIED&flags) return str;
7547 return Qnil;
7548}
7549
7550
7551/*
7552 * call-seq:
7553 * upcase(*options) -> string
7554 *
7555 * Returns a string containing the upcased characters in +self+:
7556 *
7557 * s = 'Hello World!' # => "Hello World!"
7558 * s.upcase # => "HELLO WORLD!"
7559 *
7560 * The casing may be affected by the given +options+;
7561 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7562 *
7563 * Related: String#upcase!, String#downcase, String#downcase!.
7564 *
7565 */
7566
7567static VALUE
7568rb_str_upcase(int argc, VALUE *argv, VALUE str)
7569{
7570 rb_encoding *enc;
7571 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7572 VALUE ret;
7573
7574 flags = check_case_options(argc, argv, flags);
7575 enc = str_true_enc(str);
7576 if (case_option_single_p(flags, enc, str)) {
7577 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7578 str_enc_copy_direct(ret, str);
7579 upcase_single(ret);
7580 }
7581 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7582 ret = rb_str_new(0, RSTRING_LEN(str));
7583 rb_str_ascii_casemap(str, ret, &flags, enc);
7584 }
7585 else {
7586 ret = rb_str_casemap(str, &flags, enc);
7587 }
7588
7589 return ret;
7590}
7591
7592static bool
7593downcase_single(VALUE str)
7594{
7595 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7596 bool modified = false;
7597
7598 while (s < send) {
7599 unsigned int c = *(unsigned char*)s;
7600
7601 if ('A' <= c && c <= 'Z') {
7602 *s = 'a' + (c - 'A');
7603 modified = true;
7604 }
7605 s++;
7606 }
7607
7608 return modified;
7609}
7610
7611/*
7612 * call-seq:
7613 * downcase!(*options) -> self or nil
7614 *
7615 * Downcases the characters in +self+;
7616 * returns +self+ if any changes were made, +nil+ otherwise:
7617 *
7618 * s = 'Hello World!' # => "Hello World!"
7619 * s.downcase! # => "hello world!"
7620 * s # => "hello world!"
7621 * s.downcase! # => nil
7622 *
7623 * The casing may be affected by the given +options+;
7624 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7625 *
7626 * Related: String#downcase, String#upcase, String#upcase!.
7627 *
7628 */
7629
7630static VALUE
7631rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7632{
7633 rb_encoding *enc;
7634 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7635
7636 flags = check_case_options(argc, argv, flags);
7637 str_modify_keep_cr(str);
7638 enc = str_true_enc(str);
7639 if (case_option_single_p(flags, enc, str)) {
7640 if (downcase_single(str))
7641 flags |= ONIGENC_CASE_MODIFIED;
7642 }
7643 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7644 rb_str_ascii_casemap(str, str, &flags, enc);
7645 else
7646 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7647
7648 if (ONIGENC_CASE_MODIFIED&flags) return str;
7649 return Qnil;
7650}
7651
7652
7653/*
7654 * call-seq:
7655 * downcase(*options) -> string
7656 *
7657 * Returns a string containing the downcased characters in +self+:
7658 *
7659 * s = 'Hello World!' # => "Hello World!"
7660 * s.downcase # => "hello world!"
7661 *
7662 * The casing may be affected by the given +options+;
7663 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7664 *
7665 * Related: String#downcase!, String#upcase, String#upcase!.
7666 *
7667 */
7668
7669static VALUE
7670rb_str_downcase(int argc, VALUE *argv, VALUE str)
7671{
7672 rb_encoding *enc;
7673 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7674 VALUE ret;
7675
7676 flags = check_case_options(argc, argv, flags);
7677 enc = str_true_enc(str);
7678 if (case_option_single_p(flags, enc, str)) {
7679 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7680 str_enc_copy_direct(ret, str);
7681 downcase_single(ret);
7682 }
7683 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7684 ret = rb_str_new(0, RSTRING_LEN(str));
7685 rb_str_ascii_casemap(str, ret, &flags, enc);
7686 }
7687 else {
7688 ret = rb_str_casemap(str, &flags, enc);
7689 }
7690
7691 return ret;
7692}
7693
7694
7695/*
7696 * call-seq:
7697 * capitalize!(*options) -> self or nil
7698 *
7699 * Upcases the first character in +self+;
7700 * downcases the remaining characters;
7701 * returns +self+ if any changes were made, +nil+ otherwise:
7702 *
7703 * s = 'hello World!' # => "hello World!"
7704 * s.capitalize! # => "Hello world!"
7705 * s # => "Hello world!"
7706 * s.capitalize! # => nil
7707 *
7708 * The casing may be affected by the given +options+;
7709 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7710 *
7711 * Related: String#capitalize.
7712 *
7713 */
7714
7715static VALUE
7716rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7717{
7718 rb_encoding *enc;
7719 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7720
7721 flags = check_case_options(argc, argv, flags);
7722 str_modify_keep_cr(str);
7723 enc = str_true_enc(str);
7724 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7725 if (flags&ONIGENC_CASE_ASCII_ONLY)
7726 rb_str_ascii_casemap(str, str, &flags, enc);
7727 else
7728 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7729
7730 if (ONIGENC_CASE_MODIFIED&flags) return str;
7731 return Qnil;
7732}
7733
7734
7735/*
7736 * call-seq:
7737 * capitalize(*options) -> string
7738 *
7739 * Returns a string containing the characters in +self+;
7740 * the first character is upcased;
7741 * the remaining characters are downcased:
7742 *
7743 * s = 'hello World!' # => "hello World!"
7744 * s.capitalize # => "Hello world!"
7745 *
7746 * The casing may be affected by the given +options+;
7747 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7748 *
7749 * Related: String#capitalize!.
7750 *
7751 */
7752
7753static VALUE
7754rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7755{
7756 rb_encoding *enc;
7757 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7758 VALUE ret;
7759
7760 flags = check_case_options(argc, argv, flags);
7761 enc = str_true_enc(str);
7762 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7763 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7764 ret = rb_str_new(0, RSTRING_LEN(str));
7765 rb_str_ascii_casemap(str, ret, &flags, enc);
7766 }
7767 else {
7768 ret = rb_str_casemap(str, &flags, enc);
7769 }
7770 return ret;
7771}
7772
7773
7774/*
7775 * call-seq:
7776 * swapcase!(*options) -> self or nil
7777 *
7778 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7780 * returns +self+ if any changes were made, +nil+ otherwise:
7781 *
7782 * s = 'Hello World!' # => "Hello World!"
7783 * s.swapcase! # => "hELLO wORLD!"
7784 * s # => "hELLO wORLD!"
7785 * ''.swapcase! # => nil
7786 *
7787 * The casing may be affected by the given +options+;
7788 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7789 *
7790 * Related: String#swapcase.
7791 *
7792 */
7793
7794static VALUE
7795rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7796{
7797 rb_encoding *enc;
7798 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7799
7800 flags = check_case_options(argc, argv, flags);
7801 str_modify_keep_cr(str);
7802 enc = str_true_enc(str);
7803 if (flags&ONIGENC_CASE_ASCII_ONLY)
7804 rb_str_ascii_casemap(str, str, &flags, enc);
7805 else
7806 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7807
7808 if (ONIGENC_CASE_MODIFIED&flags) return str;
7809 return Qnil;
7810}
7811
7812
7813/*
7814 * call-seq:
7815 * swapcase(*options) -> string
7816 *
7817 * Returns a string containing the characters in +self+, with cases reversed;
7818 * each uppercase character is downcased;
7819 * each lowercase character is upcased:
7820 *
7821 * s = 'Hello World!' # => "Hello World!"
7822 * s.swapcase # => "hELLO wORLD!"
7823 *
7824 * The casing may be affected by the given +options+;
7825 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7826 *
7827 * Related: String#swapcase!.
7828 *
7829 */
7830
7831static VALUE
7832rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7833{
7834 rb_encoding *enc;
7835 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7836 VALUE ret;
7837
7838 flags = check_case_options(argc, argv, flags);
7839 enc = str_true_enc(str);
7840 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7841 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7842 ret = rb_str_new(0, RSTRING_LEN(str));
7843 rb_str_ascii_casemap(str, ret, &flags, enc);
7844 }
7845 else {
7846 ret = rb_str_casemap(str, &flags, enc);
7847 }
7848 return ret;
7849}
7850
typedef unsigned char *USTR;

/* Cursor over a character-selector string, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* read position in the selector */
    char *pend;         /* end of the selector */
};
7858
7859static unsigned int
7860trnext(struct tr *t, rb_encoding *enc)
7861{
7862 int n;
7863
7864 for (;;) {
7865 nextpart:
7866 if (!t->gen) {
7867 if (t->p == t->pend) return -1;
7868 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7869 t->p += n;
7870 }
7871 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7872 t->p += n;
7873 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7874 t->p += n;
7875 if (t->p < t->pend) {
7876 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7877 t->p += n;
7878 if (t->now > c) {
7879 if (t->now < 0x80 && c < 0x80) {
7880 rb_raise(rb_eArgError,
7881 "invalid range \"%c-%c\" in string transliteration",
7882 t->now, c);
7883 }
7884 else {
7885 rb_raise(rb_eArgError, "invalid range in string transliteration");
7886 }
7887 continue; /* not reached */
7888 }
7889 else if (t->now < c) {
7890 t->gen = 1;
7891 t->max = c;
7892 }
7893 }
7894 }
7895 return t->now;
7896 }
7897 else {
7898 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7899 if (t->now == t->max) {
7900 t->gen = 0;
7901 goto nextpart;
7902 }
7903 }
7904 if (t->now < t->max) {
7905 return t->now;
7906 }
7907 else {
7908 t->gen = 0;
7909 return t->max;
7910 }
7911 }
7912 }
7913}
7914
7915static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7916
7917static VALUE
7918tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7919{
7920 const unsigned int errc = -1;
7921 unsigned int trans[256];
7922 rb_encoding *enc, *e1, *e2;
7923 struct tr trsrc, trrepl;
7924 int cflag = 0;
7925 unsigned int c, c0, last = 0;
7926 int modify = 0, i, l;
7927 unsigned char *s, *send;
7928 VALUE hash = 0;
7929 int singlebyte = single_byte_optimizable(str);
7930 int termlen;
7931 int cr;
7932
7933#define CHECK_IF_ASCII(c) \
7934 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7935 (cr = ENC_CODERANGE_VALID) : 0)
7936
7937 StringValue(src);
7938 StringValue(repl);
7939 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7940 if (RSTRING_LEN(repl) == 0) {
7941 return rb_str_delete_bang(1, &src, str);
7942 }
7943
7944 cr = ENC_CODERANGE(str);
7945 e1 = rb_enc_check(str, src);
7946 e2 = rb_enc_check(str, repl);
7947 if (e1 == e2) {
7948 enc = e1;
7949 }
7950 else {
7951 enc = rb_enc_check(src, repl);
7952 }
7953 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7954 if (RSTRING_LEN(src) > 1 &&
7955 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7956 trsrc.p + l < trsrc.pend) {
7957 cflag = 1;
7958 trsrc.p += l;
7959 }
7960 trrepl.p = RSTRING_PTR(repl);
7961 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7962 trsrc.gen = trrepl.gen = 0;
7963 trsrc.now = trrepl.now = 0;
7964 trsrc.max = trrepl.max = 0;
7965
7966 if (cflag) {
7967 for (i=0; i<256; i++) {
7968 trans[i] = 1;
7969 }
7970 while ((c = trnext(&trsrc, enc)) != errc) {
7971 if (c < 256) {
7972 trans[c] = errc;
7973 }
7974 else {
7975 if (!hash) hash = rb_hash_new();
7976 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7977 }
7978 }
7979 while ((c = trnext(&trrepl, enc)) != errc)
7980 /* retrieve last replacer */;
7981 last = trrepl.now;
7982 for (i=0; i<256; i++) {
7983 if (trans[i] != errc) {
7984 trans[i] = last;
7985 }
7986 }
7987 }
7988 else {
7989 unsigned int r;
7990
7991 for (i=0; i<256; i++) {
7992 trans[i] = errc;
7993 }
7994 while ((c = trnext(&trsrc, enc)) != errc) {
7995 r = trnext(&trrepl, enc);
7996 if (r == errc) r = trrepl.now;
7997 if (c < 256) {
7998 trans[c] = r;
7999 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
8000 }
8001 else {
8002 if (!hash) hash = rb_hash_new();
8003 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
8004 }
8005 }
8006 }
8007
8008 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
8009 cr = ENC_CODERANGE_7BIT;
8010 str_modify_keep_cr(str);
8011 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
8012 termlen = rb_enc_mbminlen(enc);
8013 if (sflag) {
8014 int clen, tlen;
8015 long offset, max = RSTRING_LEN(str);
8016 unsigned int save = -1;
8017 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8018
8019 while (s < send) {
8020 int may_modify = 0;
8021
8022 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8023 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8024
8025 s += clen;
8026 if (c < 256) {
8027 c = trans[c];
8028 }
8029 else if (hash) {
8030 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8031 if (NIL_P(tmp)) {
8032 if (cflag) c = last;
8033 else c = errc;
8034 }
8035 else if (cflag) c = errc;
8036 else c = NUM2INT(tmp);
8037 }
8038 else {
8039 c = errc;
8040 }
8041 if (c != (unsigned int)-1) {
8042 if (save == c) {
8043 CHECK_IF_ASCII(c);
8044 continue;
8045 }
8046 save = c;
8047 tlen = rb_enc_codelen(c, enc);
8048 modify = 1;
8049 }
8050 else {
8051 save = -1;
8052 c = c0;
8053 if (enc != e1) may_modify = 1;
8054 }
8055 if ((offset = t - buf) + tlen > max) {
8056 size_t MAYBE_UNUSED(old) = max + termlen;
8057 max = offset + tlen + (send - s);
8058 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8059 t = buf + offset;
8060 }
8061 rb_enc_mbcput(c, t, enc);
8062 if (may_modify && memcmp(s, t, tlen) != 0) {
8063 modify = 1;
8064 }
8065 CHECK_IF_ASCII(c);
8066 t += tlen;
8067 }
8068 if (!STR_EMBED_P(str)) {
8069 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8070 }
8071 TERM_FILL((char *)t, termlen);
8072 RSTRING(str)->as.heap.ptr = (char *)buf;
8073 STR_SET_LEN(str, t - buf);
8074 STR_SET_NOEMBED(str);
8075 RSTRING(str)->as.heap.aux.capa = max;
8076 }
8077 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8078 while (s < send) {
8079 c = (unsigned char)*s;
8080 if (trans[c] != errc) {
8081 if (!cflag) {
8082 c = trans[c];
8083 *s = c;
8084 modify = 1;
8085 }
8086 else {
8087 *s = last;
8088 modify = 1;
8089 }
8090 }
8091 CHECK_IF_ASCII(c);
8092 s++;
8093 }
8094 }
8095 else {
8096 int clen, tlen;
8097 long offset, max = (long)((send - s) * 1.2);
8098 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8099
8100 while (s < send) {
8101 int may_modify = 0;
8102 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8103 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8104
8105 if (c < 256) {
8106 c = trans[c];
8107 }
8108 else if (hash) {
8109 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8110 if (NIL_P(tmp)) {
8111 if (cflag) c = last;
8112 else c = errc;
8113 }
8114 else if (cflag) c = errc;
8115 else c = NUM2INT(tmp);
8116 }
8117 else {
8118 c = cflag ? last : errc;
8119 }
8120 if (c != errc) {
8121 tlen = rb_enc_codelen(c, enc);
8122 modify = 1;
8123 }
8124 else {
8125 c = c0;
8126 if (enc != e1) may_modify = 1;
8127 }
8128 if ((offset = t - buf) + tlen > max) {
8129 size_t MAYBE_UNUSED(old) = max + termlen;
8130 max = offset + tlen + (long)((send - s) * 1.2);
8131 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8132 t = buf + offset;
8133 }
8134 if (s != t) {
8135 rb_enc_mbcput(c, t, enc);
8136 if (may_modify && memcmp(s, t, tlen) != 0) {
8137 modify = 1;
8138 }
8139 }
8140 CHECK_IF_ASCII(c);
8141 s += clen;
8142 t += tlen;
8143 }
8144 if (!STR_EMBED_P(str)) {
8145 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8146 }
8147 TERM_FILL((char *)t, termlen);
8148 RSTRING(str)->as.heap.ptr = (char *)buf;
8149 STR_SET_LEN(str, t - buf);
8150 STR_SET_NOEMBED(str);
8151 RSTRING(str)->as.heap.aux.capa = max;
8152 }
8153
8154 if (modify) {
8155 if (cr != ENC_CODERANGE_BROKEN)
8156 ENC_CODERANGE_SET(str, cr);
8157 rb_enc_associate(str, enc);
8158 return str;
8159 }
8160 return Qnil;
8161}
8162
8163
8164/*
8165 * call-seq:
8166 * tr!(selector, replacements) -> self or nil
8167 *
8168 * Like String#tr, but modifies +self+ in place.
8169 * Returns +self+ if any changes were made, +nil+ otherwise.
8170 *
8171 */
8172
8173static VALUE
8174rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8175{
8176 return tr_trans(str, src, repl, 0);
8177}
8178
8179
8180/*
8181 * call-seq:
8182 * tr(selector, replacements) -> new_string
8183 *
8184 * Returns a copy of +self+ with each character specified by string +selector+
8185 * translated to the corresponding character in string +replacements+.
8186 * The correspondence is _positional_:
8187 *
8188 * - Each occurrence of the first character specified by +selector+
8189 * is translated to the first character in +replacements+.
8190 * - Each occurrence of the second character specified by +selector+
8191 * is translated to the second character in +replacements+.
8192 * - And so on.
8193 *
8194 * Example:
8195 *
8196 * 'hello'.tr('el', 'ip') #=> "hippo"
8197 *
8198 * If +replacements+ is shorter than +selector+,
8199 * it is implicitly padded with its own last character:
8200 *
8201 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8202 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8203 *
8204 * Arguments +selector+ and +replacements+ must be valid character selectors
8205 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8206 * and may use any of its valid forms, including negation, ranges, and escaping:
8207 *
8208 * # Negation.
8209 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8210 * # Ranges.
8211 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8212 * # Escapes.
8213 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8214 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8215 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8216 *
8217 */
8218
8219static VALUE
8220rb_str_tr(VALUE str, VALUE src, VALUE repl)
8221{
8222 str = str_duplicate(rb_cString, str);
8223 tr_trans(str, src, repl, 0);
8224 return str;
8225}
8226
8227#define TR_TABLE_MAX (UCHAR_MAX+1)
8228#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8229static void
8230tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8231 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8232{
8233 const unsigned int errc = -1;
8234 char buf[TR_TABLE_MAX];
8235 struct tr tr;
8236 unsigned int c;
8237 VALUE table = 0, ptable = 0;
8238 int i, l, cflag = 0;
8239
8240 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8241 tr.gen = tr.now = tr.max = 0;
8242
8243 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8244 cflag = 1;
8245 tr.p += l;
8246 }
8247 if (first) {
8248 for (i=0; i<TR_TABLE_MAX; i++) {
8249 stable[i] = 1;
8250 }
8251 stable[TR_TABLE_MAX] = cflag;
8252 }
8253 else if (stable[TR_TABLE_MAX] && !cflag) {
8254 stable[TR_TABLE_MAX] = 0;
8255 }
8256 for (i=0; i<TR_TABLE_MAX; i++) {
8257 buf[i] = cflag;
8258 }
8259
8260 while ((c = trnext(&tr, enc)) != errc) {
8261 if (c < TR_TABLE_MAX) {
8262 buf[(unsigned char)c] = !cflag;
8263 }
8264 else {
8265 VALUE key = UINT2NUM(c);
8266
8267 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8268 if (cflag) {
8269 ptable = *ctablep;
8270 table = ptable ? ptable : rb_hash_new();
8271 *ctablep = table;
8272 }
8273 else {
8274 table = rb_hash_new();
8275 ptable = *tablep;
8276 *tablep = table;
8277 }
8278 }
8279 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8280 rb_hash_aset(table, key, Qtrue);
8281 }
8282 }
8283 }
8284 for (i=0; i<TR_TABLE_MAX; i++) {
8285 stable[i] = stable[i] && buf[i];
8286 }
8287 if (!table && !cflag) {
8288 *tablep = 0;
8289 }
8290}
8291
8292
8293static int
8294tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8295{
8296 if (c < TR_TABLE_MAX) {
8297 return table[c] != 0;
8298 }
8299 else {
8300 VALUE v = UINT2NUM(c);
8301
8302 if (del) {
8303 if (!NIL_P(rb_hash_lookup(del, v)) &&
8304 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8305 return TRUE;
8306 }
8307 }
8308 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8309 return FALSE;
8310 }
8311 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8312 }
8313}
8314
8315/*
8316 * call-seq:
8317 * delete!(*selectors) -> self or nil
8318 *
8319 * Like String#delete, but modifies +self+ in place.
8320 * Returns +self+ if any changes were made, +nil+ otherwise.
8321 *
8322 */
8323
8324static VALUE
8325rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8326{
8327 char squeez[TR_TABLE_SIZE];
8328 rb_encoding *enc = 0;
8329 char *s, *send, *t;
8330 VALUE del = 0, nodel = 0;
8331 int modify = 0;
8332 int i, ascompat, cr;
8333
8334 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8336 for (i=0; i<argc; i++) {
8337 VALUE s = argv[i];
8338
8339 StringValue(s);
8340 enc = rb_enc_check(str, s);
8341 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8342 }
8343
8344 str_modify_keep_cr(str);
8345 ascompat = rb_enc_asciicompat(enc);
8346 s = t = RSTRING_PTR(str);
8347 send = RSTRING_END(str);
8348 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8349 while (s < send) {
8350 unsigned int c;
8351 int clen;
8352
8353 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8354 if (squeez[c]) {
8355 modify = 1;
8356 }
8357 else {
8358 if (t != s) *t = c;
8359 t++;
8360 }
8361 s++;
8362 }
8363 else {
8364 c = rb_enc_codepoint_len(s, send, &clen, enc);
8365
8366 if (tr_find(c, squeez, del, nodel)) {
8367 modify = 1;
8368 }
8369 else {
8370 if (t != s) rb_enc_mbcput(c, t, enc);
8371 t += clen;
8373 }
8374 s += clen;
8375 }
8376 }
8377 TERM_FILL(t, TERM_LEN(str));
8378 STR_SET_LEN(str, t - RSTRING_PTR(str));
8379 ENC_CODERANGE_SET(str, cr);
8380
8381 if (modify) return str;
8382 return Qnil;
8383}
8384
8385
8386/*
8387 * call-seq:
8388 * delete(*selectors) -> new_string
8389 *
8390 * Returns a copy of +self+ with characters specified by +selectors+ removed
8391 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8392 *
8393 * "hello".delete "l","lo" #=> "heo"
8394 * "hello".delete "lo" #=> "he"
8395 * "hello".delete "aeiou", "^e" #=> "hell"
8396 * "hello".delete "ej-m" #=> "ho"
8397 *
8398 */
8399
8400static VALUE
8401rb_str_delete(int argc, VALUE *argv, VALUE str)
8402{
8403 str = str_duplicate(rb_cString, str);
8404 rb_str_delete_bang(argc, argv, str);
8405 return str;
8406}
8407
8408
8409/*
8410 * call-seq:
8411 * squeeze!(*selectors) -> self or nil
8412 *
8413 * Like String#squeeze, but modifies +self+ in place.
8414 * Returns +self+ if any changes were made, +nil+ otherwise.
8415 */
8416
8417static VALUE
8418rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8419{
8420 char squeez[TR_TABLE_SIZE];
8421 rb_encoding *enc = 0;
8422 VALUE del = 0, nodel = 0;
8423 unsigned char *s, *send, *t;
8424 int i, modify = 0;
8425 int ascompat, singlebyte = single_byte_optimizable(str);
8426 unsigned int save;
8427
8428 if (argc == 0) {
8429 enc = STR_ENC_GET(str);
8430 }
8431 else {
8432 for (i=0; i<argc; i++) {
8433 VALUE s = argv[i];
8434
8435 StringValue(s);
8436 enc = rb_enc_check(str, s);
8437 if (singlebyte && !single_byte_optimizable(s))
8438 singlebyte = 0;
8439 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8440 }
8441 }
8442
8443 str_modify_keep_cr(str);
8444 s = t = (unsigned char *)RSTRING_PTR(str);
8445 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8446 send = (unsigned char *)RSTRING_END(str);
8447 save = -1;
8448 ascompat = rb_enc_asciicompat(enc);
8449
8450 if (singlebyte) {
8451 while (s < send) {
8452 unsigned int c = *s++;
8453 if (c != save || (argc > 0 && !squeez[c])) {
8454 *t++ = save = c;
8455 }
8456 }
8457 }
8458 else {
8459 while (s < send) {
8460 unsigned int c;
8461 int clen;
8462
8463 if (ascompat && (c = *s) < 0x80) {
8464 if (c != save || (argc > 0 && !squeez[c])) {
8465 *t++ = save = c;
8466 }
8467 s++;
8468 }
8469 else {
8470 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8471
8472 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8473 if (t != s) rb_enc_mbcput(c, t, enc);
8474 save = c;
8475 t += clen;
8476 }
8477 s += clen;
8478 }
8479 }
8480 }
8481
8482 TERM_FILL((char *)t, TERM_LEN(str));
8483 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8484 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8485 modify = 1;
8486 }
8487
8488 if (modify) return str;
8489 return Qnil;
8490}
8491
8492
8493/*
8494 * call-seq:
8495 * squeeze(*selectors) -> new_string
8496 *
8497 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8498 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8499 *
8500 * "Squeezed" means that each multiple-character run of a selected character
8501 * is squeezed down to a single character;
8502 * with no arguments given, squeezes all characters:
8503 *
8504 * "yellow moon".squeeze #=> "yelow mon"
8505 * " now is the".squeeze(" ") #=> " now is the"
8506 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8507 *
8508 */
8509
8510static VALUE
8511rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8512{
8513 str = str_duplicate(rb_cString, str);
8514 rb_str_squeeze_bang(argc, argv, str);
8515 return str;
8516}
8517
8518
8519/*
8520 * call-seq:
8521 * tr_s!(selector, replacements) -> self or nil
8522 *
8523 * Like String#tr_s, but modifies +self+ in place.
8524 * Returns +self+ if any changes were made, +nil+ otherwise.
8525 *
8526 * Related: String#squeeze!.
8527 */
8528
8529static VALUE
8530rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8531{
8532 return tr_trans(str, src, repl, 1);
8533}
8534
8535
8536/*
8537 * call-seq:
8538 * tr_s(selector, replacements) -> string
8539 *
8540 * Like String#tr, but also squeezes the modified portions of the translated string;
8541 * returns a new string (translated and squeezed).
8542 *
8543 * 'hello'.tr_s('l', 'r') #=> "hero"
8544 * 'hello'.tr_s('el', '-') #=> "h-o"
8545 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8546 *
8547 * Related: String#squeeze.
8548 *
8549 */
8550
8551static VALUE
8552rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8553{
8554 str = str_duplicate(rb_cString, str);
8555 tr_trans(str, src, repl, 1);
8556 return str;
8557}
8558
8559
8560/*
8561 * call-seq:
8562 * count(*selectors) -> integer
8563 *
8564 * Returns the total number of characters in +self+
8565 * that are specified by the given +selectors+
8566 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8567 *
8568 * a = "hello world"
8569 * a.count "lo" #=> 5
8570 * a.count "lo", "o" #=> 2
8571 * a.count "hello", "^l" #=> 4
8572 * a.count "ej-m" #=> 4
8573 *
8574 * "hello^world".count "\\^aeiou" #=> 4
8575 * "hello-world".count "a\\-eo" #=> 4
8576 *
8577 * c = "hello world\\r\\n"
8578 * c.count "\\" #=> 2
8579 * c.count "\\A" #=> 0
8580 * c.count "X-\\w" #=> 3
8581 */
8582
8583static VALUE
8584rb_str_count(int argc, VALUE *argv, VALUE str)
8585{
8586 char table[TR_TABLE_SIZE];
8587 rb_encoding *enc = 0;
8588 VALUE del = 0, nodel = 0, tstr;
8589 char *s, *send;
8590 int i;
8591 int ascompat;
8592 size_t n = 0;
8593
8595
8596 tstr = argv[0];
8597 StringValue(tstr);
8598 enc = rb_enc_check(str, tstr);
8599 if (argc == 1) {
8600 const char *ptstr;
8601 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8602 (ptstr = RSTRING_PTR(tstr),
8603 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8604 !is_broken_string(str)) {
8605 int clen;
8606 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8607
8608 s = RSTRING_PTR(str);
8609 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8610 send = RSTRING_END(str);
8611 while (s < send) {
8612 if (*(unsigned char*)s++ == c) n++;
8613 }
8614 return SIZET2NUM(n);
8615 }
8616 }
8617
8618 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8619 for (i=1; i<argc; i++) {
8620 tstr = argv[i];
8621 StringValue(tstr);
8622 enc = rb_enc_check(str, tstr);
8623 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8624 }
8625
8626 s = RSTRING_PTR(str);
8627 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8628 send = RSTRING_END(str);
8629 ascompat = rb_enc_asciicompat(enc);
8630 while (s < send) {
8631 unsigned int c;
8632
8633 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8634 if (table[c]) {
8635 n++;
8636 }
8637 s++;
8638 }
8639 else {
8640 int clen;
8641 c = rb_enc_codepoint_len(s, send, &clen, enc);
8642 if (tr_find(c, table, del, nodel)) {
8643 n++;
8644 }
8645 s += clen;
8646 }
8647 }
8648
8649 return SIZET2NUM(n);
8650}
8651
8652static VALUE
8653rb_fs_check(VALUE val)
8654{
8655 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8656 val = rb_check_string_type(val);
8657 if (NIL_P(val)) return 0;
8658 }
8659 return val;
8660}
8661
/*
 * Byte-indexed whitespace table: non-zero for 0x09..0x0d and 0x20,
 * zero for every other byte value.
 */
static const char isspacetable[256] = {
    [0x09] = 1,
    [0x0a] = 1,
    [0x0b] = 1,
    [0x0c] = 1,
    [0x0d] = 1,
    [0x20] = 1,
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8682
8683static long
8684split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8685{
8686 if (empty_count >= 0 && len == 0) {
8687 return empty_count + 1;
8688 }
8689 if (empty_count > 0) {
8690 /* make different substrings */
8691 if (result) {
8692 do {
8693 rb_ary_push(result, str_new_empty_String(str));
8694 } while (--empty_count > 0);
8695 }
8696 else {
8697 do {
8698 rb_yield(str_new_empty_String(str));
8699 } while (--empty_count > 0);
8700 }
8701 }
8702 str = rb_str_subseq(str, beg, len);
8703 if (result) {
8704 rb_ary_push(result, str);
8705 }
8706 else {
8707 rb_yield(str);
8708 }
8709 return empty_count;
8710}
8711
/* Strategy selected by rb_str_split_m for cutting the receiver. */
typedef enum {
    SPLIT_TYPE_AWK,     /* whitespace-run splitting */
    SPLIT_TYPE_STRING,  /* literal string separator */
    SPLIT_TYPE_REGEXP,  /* regular expression separator */
    SPLIT_TYPE_CHARS    /* empty separator: one field per character */
} split_type_t;
8715
8716static split_type_t
8717literal_split_pattern(VALUE spat, split_type_t default_type)
8718{
8719 rb_encoding *enc = STR_ENC_GET(spat);
8720 const char *ptr;
8721 long len;
8722 RSTRING_GETMEM(spat, ptr, len);
8723 if (len == 0) {
8724 /* Special case - split into chars */
8725 return SPLIT_TYPE_CHARS;
8726 }
8727 else if (rb_enc_asciicompat(enc)) {
8728 if (len == 1 && ptr[0] == ' ') {
8729 return SPLIT_TYPE_AWK;
8730 }
8731 }
8732 else {
8733 int l;
8734 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8735 return SPLIT_TYPE_AWK;
8736 }
8737 }
8738 return default_type;
8739}
8740
8741/*
8742 * call-seq:
8743 * split(field_sep = $;, limit = nil) -> array
8744 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8745 *
8746 * :include: doc/string/split.rdoc
8747 *
8748 */
8749
8750static VALUE
8751rb_str_split_m(int argc, VALUE *argv, VALUE str)
8752{
8753 rb_encoding *enc;
8754 VALUE spat;
8755 VALUE limit;
8756 split_type_t split_type;
8757 long beg, end, i = 0, empty_count = -1;
8758 int lim = 0;
8759 VALUE result, tmp;
8760
8761 result = rb_block_given_p() ? Qfalse : Qnil;
8762 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8763 lim = NUM2INT(limit);
8764 if (lim <= 0) limit = Qnil;
8765 else if (lim == 1) {
8766 if (RSTRING_LEN(str) == 0)
8767 return result ? rb_ary_new2(0) : str;
8768 tmp = str_duplicate(rb_cString, str);
8769 if (!result) {
8770 rb_yield(tmp);
8771 return str;
8772 }
8773 return rb_ary_new3(1, tmp);
8774 }
8775 i = 1;
8776 }
8777 if (NIL_P(limit) && !lim) empty_count = 0;
8778
8779 enc = STR_ENC_GET(str);
8780 split_type = SPLIT_TYPE_REGEXP;
8781 if (!NIL_P(spat)) {
8782 spat = get_pat_quoted(spat, 0);
8783 }
8784 else if (NIL_P(spat = rb_fs)) {
8785 split_type = SPLIT_TYPE_AWK;
8786 }
8787 else if (!(spat = rb_fs_check(spat))) {
8788 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8789 }
8790 else {
8791 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8792 }
8793 if (split_type != SPLIT_TYPE_AWK) {
8794 switch (BUILTIN_TYPE(spat)) {
8795 case T_REGEXP:
8796 rb_reg_options(spat); /* check if uninitialized */
8797 tmp = RREGEXP_SRC(spat);
8798 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8799 if (split_type == SPLIT_TYPE_AWK) {
8800 spat = tmp;
8801 split_type = SPLIT_TYPE_STRING;
8802 }
8803 break;
8804
8805 case T_STRING:
8806 mustnot_broken(spat);
8807 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8808 break;
8809
8810 default:
8812 }
8813 }
8814
8815#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8816
8817 beg = 0;
8818 char *ptr = RSTRING_PTR(str);
8819 char *eptr = RSTRING_END(str);
8820 if (split_type == SPLIT_TYPE_AWK) {
8821 char *bptr = ptr;
8822 int skip = 1;
8823 unsigned int c;
8824
8825 if (result) result = rb_ary_new();
8826 end = beg;
8827 if (is_ascii_string(str)) {
8828 while (ptr < eptr) {
8829 c = (unsigned char)*ptr++;
8830 if (skip) {
8831 if (ascii_isspace(c)) {
8832 beg = ptr - bptr;
8833 }
8834 else {
8835 end = ptr - bptr;
8836 skip = 0;
8837 if (!NIL_P(limit) && lim <= i) break;
8838 }
8839 }
8840 else if (ascii_isspace(c)) {
8841 SPLIT_STR(beg, end-beg);
8842 skip = 1;
8843 beg = ptr - bptr;
8844 if (!NIL_P(limit)) ++i;
8845 }
8846 else {
8847 end = ptr - bptr;
8848 }
8849 }
8850 }
8851 else {
8852 while (ptr < eptr) {
8853 int n;
8854
8855 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8856 ptr += n;
8857 if (skip) {
8858 if (rb_isspace(c)) {
8859 beg = ptr - bptr;
8860 }
8861 else {
8862 end = ptr - bptr;
8863 skip = 0;
8864 if (!NIL_P(limit) && lim <= i) break;
8865 }
8866 }
8867 else if (rb_isspace(c)) {
8868 SPLIT_STR(beg, end-beg);
8869 skip = 1;
8870 beg = ptr - bptr;
8871 if (!NIL_P(limit)) ++i;
8872 }
8873 else {
8874 end = ptr - bptr;
8875 }
8876 }
8877 }
8878 }
8879 else if (split_type == SPLIT_TYPE_STRING) {
8880 char *str_start = ptr;
8881 char *substr_start = ptr;
8882 char *sptr = RSTRING_PTR(spat);
8883 long slen = RSTRING_LEN(spat);
8884
8885 if (result) result = rb_ary_new();
8886 mustnot_broken(str);
8887 enc = rb_enc_check(str, spat);
8888 while (ptr < eptr &&
8889 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8890 /* Check we are at the start of a char */
8891 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8892 if (t != ptr + end) {
8893 ptr = t;
8894 continue;
8895 }
8896 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8897 ptr += end + slen;
8898 substr_start = ptr;
8899 if (!NIL_P(limit) && lim <= ++i) break;
8900 }
8901 beg = ptr - str_start;
8902 }
8903 else if (split_type == SPLIT_TYPE_CHARS) {
8904 char *str_start = ptr;
8905 int n;
8906
8907 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
8908 mustnot_broken(str);
8909 enc = rb_enc_get(str);
8910 while (ptr < eptr &&
8911 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8912 SPLIT_STR(ptr - str_start, n);
8913 ptr += n;
8914 if (!NIL_P(limit) && lim <= ++i) break;
8915 }
8916 beg = ptr - str_start;
8917 }
8918 else {
8919 if (result) result = rb_ary_new();
8920 long len = RSTRING_LEN(str);
8921 long start = beg;
8922 long idx;
8923 int last_null = 0;
8924 struct re_registers *regs;
8925 VALUE match = 0;
8926
8927 for (; rb_reg_search(spat, str, start, 0) >= 0;
8928 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8929 match = rb_backref_get();
8930 if (!result) rb_match_busy(match);
8931 regs = RMATCH_REGS(match);
8932 end = BEG(0);
8933 if (start == end && BEG(0) == END(0)) {
8934 if (!ptr) {
8935 SPLIT_STR(0, 0);
8936 break;
8937 }
8938 else if (last_null == 1) {
8939 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8940 beg = start;
8941 }
8942 else {
8943 if (start == len)
8944 start++;
8945 else
8946 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8947 last_null = 1;
8948 continue;
8949 }
8950 }
8951 else {
8952 SPLIT_STR(beg, end-beg);
8953 beg = start = END(0);
8954 }
8955 last_null = 0;
8956
8957 for (idx=1; idx < regs->num_regs; idx++) {
8958 if (BEG(idx) == -1) continue;
8959 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8960 }
8961 if (!NIL_P(limit) && lim <= ++i) break;
8962 }
8963 if (match) rb_match_unbusy(match);
8964 }
8965 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8966 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8967 }
8968
8969 return result ? result : str;
8970}
8971
8972VALUE
8973rb_str_split(VALUE str, const char *sep0)
8974{
8975 VALUE sep;
8976
8977 StringValue(str);
8978 sep = rb_str_new_cstr(sep0);
8979 return rb_str_split_m(1, &sep, str);
8980}
8981
8982#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8983
8984static inline int
8985enumerator_element(VALUE ary, VALUE e)
8986{
8987 if (ary) {
8988 rb_ary_push(ary, e);
8989 return 0;
8990 }
8991 else {
8992 rb_yield(e);
8993 return 1;
8994 }
8995}
8996
8997#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8998
8999static const char *
9000chomp_newline(const char *p, const char *e, rb_encoding *enc)
9001{
9002 const char *prev = rb_enc_prev_char(p, e, e, enc);
9003 if (rb_enc_is_newline(prev, e, enc)) {
9004 e = prev;
9005 prev = rb_enc_prev_char(p, e, e, enc);
9006 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
9007 e = prev;
9008 }
9009 return e;
9010}
9011
9012static VALUE
9013get_rs(void)
9014{
9015 VALUE rs = rb_rs;
9016 if (!NIL_P(rs) &&
9017 (!RB_TYPE_P(rs, T_STRING) ||
9018 RSTRING_LEN(rs) != 1 ||
9019 RSTRING_PTR(rs)[0] != '\n')) {
9020 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
9021 }
9022 return rs;
9023}
9024
9025#define rb_rs get_rs()
9026
9027static VALUE
9028rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
9029{
9030 rb_encoding *enc;
9031 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
9032 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
9033 long pos, len, rslen;
9034 int rsnewline = 0;
9035
9036 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9037 rs = rb_rs;
9038 if (!NIL_P(opts)) {
9039 static ID keywords[1];
9040 if (!keywords[0]) {
9041 keywords[0] = rb_intern_const("chomp");
9042 }
9043 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9044 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9045 }
9046
9047 if (NIL_P(rs)) {
9048 if (!ENUM_ELEM(ary, str)) {
9049 return ary;
9050 }
9051 else {
9052 return orig;
9053 }
9054 }
9055
9056 if (!RSTRING_LEN(str)) goto end;
9057 str = rb_str_new_frozen(str);
9058 ptr = subptr = RSTRING_PTR(str);
9059 pend = RSTRING_END(str);
9060 len = RSTRING_LEN(str);
9061 StringValue(rs);
9062 rslen = RSTRING_LEN(rs);
9063
9064 if (rs == rb_default_rs)
9065 enc = rb_enc_get(str);
9066 else
9067 enc = rb_enc_check(str, rs);
9068
9069 if (rslen == 0) {
9070 /* paragraph mode */
9071 int n;
9072 const char *eol = NULL;
9073 subend = subptr;
9074 while (subend < pend) {
9075 long chomp_rslen = 0;
9076 do {
9077 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9078 n = 0;
9079 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9080 if (rb_enc_is_newline(subend + n, pend, enc)) {
9081 if (eol == subend) break;
9082 subend += rslen;
9083 if (subptr) {
9084 eol = subend;
9085 chomp_rslen = -rslen;
9086 }
9087 }
9088 else {
9089 if (!subptr) subptr = subend;
9090 subend += rslen;
9091 }
9092 rslen = 0;
9093 } while (subend < pend);
9094 if (!subptr) break;
9095 if (rslen == 0) chomp_rslen = 0;
9096 line = rb_str_subseq(str, subptr - ptr,
9097 subend - subptr + (chomp ? chomp_rslen : rslen));
9098 if (ENUM_ELEM(ary, line)) {
9099 str_mod_check(str, ptr, len);
9100 }
9101 subptr = eol = NULL;
9102 }
9103 goto end;
9104 }
9105 else {
9106 rsptr = RSTRING_PTR(rs);
9107 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9108 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9109 rsnewline = 1;
9110 }
9111 }
9112
9113 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9114 rs = rb_str_new(rsptr, rslen);
9115 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9116 rsptr = RSTRING_PTR(rs);
9117 rslen = RSTRING_LEN(rs);
9118 }
9119
9120 while (subptr < pend) {
9121 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9122 if (pos < 0) break;
9123 hit = subptr + pos;
9124 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9125 if (hit != adjusted) {
9126 subptr = adjusted;
9127 continue;
9128 }
9129 subend = hit += rslen;
9130 if (chomp) {
9131 if (rsnewline) {
9132 subend = chomp_newline(subptr, subend, enc);
9133 }
9134 else {
9135 subend -= rslen;
9136 }
9137 }
9138 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9139 if (ENUM_ELEM(ary, line)) {
9140 str_mod_check(str, ptr, len);
9141 }
9142 subptr = hit;
9143 }
9144
9145 if (subptr != pend) {
9146 if (chomp) {
9147 if (rsnewline) {
9148 pend = chomp_newline(subptr, pend, enc);
9149 }
9150 else if (pend - subptr >= rslen &&
9151 memcmp(pend - rslen, rsptr, rslen) == 0) {
9152 pend -= rslen;
9153 }
9154 }
9155 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9156 ENUM_ELEM(ary, line);
9157 RB_GC_GUARD(str);
9158 }
9159
9160 end:
9161 if (ary)
9162 return ary;
9163 else
9164 return orig;
9165}
9166
9167/*
9168 * call-seq:
9169 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9170 * each_line(line_sep = $/, chomp: false) -> enumerator
9171 *
9172 * :include: doc/string/each_line.rdoc
9173 *
9174 */
9175
9176static VALUE
9177rb_str_each_line(int argc, VALUE *argv, VALUE str)
9178{
9179 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9180 return rb_str_enumerate_lines(argc, argv, str, 0);
9181}
9182
9183/*
9184 * call-seq:
9185 * lines(line_sep = $/, chomp: false) -> array_of_strings
9186 *
9187 * Forms substrings ("lines") of +self+ according to the given arguments
9188 * (see String#each_line for details); returns the lines in an array.
9189 *
9190 */
9191
9192static VALUE
9193rb_str_lines(int argc, VALUE *argv, VALUE str)
9194{
9195 VALUE ary = WANTARRAY("lines", 0);
9196 return rb_str_enumerate_lines(argc, argv, str, ary);
9197}
9198
9199static VALUE
9200rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9201{
9202 return LONG2FIX(RSTRING_LEN(str));
9203}
9204
9205static VALUE
9206rb_str_enumerate_bytes(VALUE str, VALUE ary)
9207{
9208 long i;
9209
9210 for (i=0; i<RSTRING_LEN(str); i++) {
9211 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9212 }
9213 if (ary)
9214 return ary;
9215 else
9216 return str;
9217}
9218
9219/*
9220 * call-seq:
9221 * each_byte {|byte| ... } -> self
9222 * each_byte -> enumerator
9223 *
9224 * :include: doc/string/each_byte.rdoc
9225 *
9226 */
9227
9228static VALUE
9229rb_str_each_byte(VALUE str)
9230{
9231 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9232 return rb_str_enumerate_bytes(str, 0);
9233}
9234
9235/*
9236 * call-seq:
9237 * bytes -> array_of_bytes
9238 *
9239 * :include: doc/string/bytes.rdoc
9240 *
9241 */
9242
9243static VALUE
9244rb_str_bytes(VALUE str)
9245{
9246 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9247 return rb_str_enumerate_bytes(str, ary);
9248}
9249
9250static VALUE
9251rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9252{
9253 return rb_str_length(str);
9254}
9255
9256static VALUE
9257rb_str_enumerate_chars(VALUE str, VALUE ary)
9258{
9259 VALUE orig = str;
9260 long i, len, n;
9261 const char *ptr;
9262 rb_encoding *enc;
9263
9264 str = rb_str_new_frozen(str);
9265 ptr = RSTRING_PTR(str);
9266 len = RSTRING_LEN(str);
9267 enc = rb_enc_get(str);
9268
9270 for (i = 0; i < len; i += n) {
9271 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9272 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9273 }
9274 }
9275 else {
9276 for (i = 0; i < len; i += n) {
9277 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9278 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9279 }
9280 }
9281 RB_GC_GUARD(str);
9282 if (ary)
9283 return ary;
9284 else
9285 return orig;
9286}
9287
9288/*
9289 * call-seq:
9290 * each_char {|c| ... } -> self
9291 * each_char -> enumerator
9292 *
9293 * :include: doc/string/each_char.rdoc
9294 *
9295 */
9296
9297static VALUE
9298rb_str_each_char(VALUE str)
9299{
9300 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9301 return rb_str_enumerate_chars(str, 0);
9302}
9303
9304/*
9305 * call-seq:
9306 * chars -> array_of_characters
9307 *
9308 * :include: doc/string/chars.rdoc
9309 *
9310 */
9311
9312static VALUE
9313rb_str_chars(VALUE str)
9314{
9315 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9316 return rb_str_enumerate_chars(str, ary);
9317}
9318
9319static VALUE
9320rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9321{
9322 VALUE orig = str;
9323 int n;
9324 unsigned int c;
9325 const char *ptr, *end;
9326 rb_encoding *enc;
9327
9328 if (single_byte_optimizable(str))
9329 return rb_str_enumerate_bytes(str, ary);
9330
9331 str = rb_str_new_frozen(str);
9332 ptr = RSTRING_PTR(str);
9333 end = RSTRING_END(str);
9334 enc = STR_ENC_GET(str);
9335
9336 while (ptr < end) {
9337 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9338 ENUM_ELEM(ary, UINT2NUM(c));
9339 ptr += n;
9340 }
9341 RB_GC_GUARD(str);
9342 if (ary)
9343 return ary;
9344 else
9345 return orig;
9346}
9347
9348/*
9349 * call-seq:
9350 * each_codepoint {|integer| ... } -> self
9351 * each_codepoint -> enumerator
9352 *
9353 * :include: doc/string/each_codepoint.rdoc
9354 *
9355 */
9356
9357static VALUE
9358rb_str_each_codepoint(VALUE str)
9359{
9360 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9361 return rb_str_enumerate_codepoints(str, 0);
9362}
9363
9364/*
9365 * call-seq:
9366 * codepoints -> array_of_integers
9367 *
9368 * :include: doc/string/codepoints.rdoc
9369 *
9370 */
9371
9372static VALUE
9373rb_str_codepoints(VALUE str)
9374{
9375 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9376 return rb_str_enumerate_codepoints(str, ary);
9377}
9378
9379static regex_t *
9380get_reg_grapheme_cluster(rb_encoding *enc)
9381{
9382 int encidx = rb_enc_to_index(enc);
9383
9384 const OnigUChar source_ascii[] = "\\X";
9385 const OnigUChar *source = source_ascii;
9386 size_t source_len = sizeof(source_ascii) - 1;
9387
9388 switch (encidx) {
9389#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9390#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9391#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9392#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9393#define CASE_UTF(e) \
9394 case ENCINDEX_UTF_##e: { \
9395 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9396 source = source_UTF_##e; \
9397 source_len = sizeof(source_UTF_##e); \
9398 break; \
9399 }
9400 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9401#undef CASE_UTF
9402#undef CHARS_16BE
9403#undef CHARS_16LE
9404#undef CHARS_32BE
9405#undef CHARS_32LE
9406 }
9407
9408 regex_t *reg_grapheme_cluster;
9409 OnigErrorInfo einfo;
9410 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9411 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9412 if (r) {
9413 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9414 onig_error_code_to_str(message, r, &einfo);
9415 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9416 }
9417
9418 return reg_grapheme_cluster;
9419}
9420
9421static regex_t *
9422get_cached_reg_grapheme_cluster(rb_encoding *enc)
9423{
9424 int encidx = rb_enc_to_index(enc);
9425 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9426
9427 if (encidx == rb_utf8_encindex()) {
9428 if (!reg_grapheme_cluster_utf8) {
9429 reg_grapheme_cluster_utf8 = get_reg_grapheme_cluster(enc);
9430 }
9431
9432 return reg_grapheme_cluster_utf8;
9433 }
9434
9435 return NULL;
9436}
9437
9438static VALUE
9439rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9440{
9441 size_t grapheme_cluster_count = 0;
9442 rb_encoding *enc = get_encoding(str);
9443 const char *ptr, *end;
9444
9445 if (!rb_enc_unicode_p(enc)) {
9446 return rb_str_length(str);
9447 }
9448
9449 bool cached_reg_grapheme_cluster = true;
9450 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9451 if (!reg_grapheme_cluster) {
9452 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9453 cached_reg_grapheme_cluster = false;
9454 }
9455
9456 ptr = RSTRING_PTR(str);
9457 end = RSTRING_END(str);
9458
9459 while (ptr < end) {
9460 OnigPosition len = onig_match(reg_grapheme_cluster,
9461 (const OnigUChar *)ptr, (const OnigUChar *)end,
9462 (const OnigUChar *)ptr, NULL, 0);
9463 if (len <= 0) break;
9464 grapheme_cluster_count++;
9465 ptr += len;
9466 }
9467
9468 if (!cached_reg_grapheme_cluster) {
9469 onig_free(reg_grapheme_cluster);
9470 }
9471
9472 return SIZET2NUM(grapheme_cluster_count);
9473}
9474
9475static VALUE
9476rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9477{
9478 VALUE orig = str;
9479 rb_encoding *enc = get_encoding(str);
9480 const char *ptr0, *ptr, *end;
9481
9482 if (!rb_enc_unicode_p(enc)) {
9483 return rb_str_enumerate_chars(str, ary);
9484 }
9485
9486 if (!ary) str = rb_str_new_frozen(str);
9487
9488 bool cached_reg_grapheme_cluster = true;
9489 regex_t *reg_grapheme_cluster = get_cached_reg_grapheme_cluster(enc);
9490 if (!reg_grapheme_cluster) {
9491 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9492 cached_reg_grapheme_cluster = false;
9493 }
9494
9495 ptr0 = ptr = RSTRING_PTR(str);
9496 end = RSTRING_END(str);
9497
9498 while (ptr < end) {
9499 OnigPosition len = onig_match(reg_grapheme_cluster,
9500 (const OnigUChar *)ptr, (const OnigUChar *)end,
9501 (const OnigUChar *)ptr, NULL, 0);
9502 if (len <= 0) break;
9503 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9504 ptr += len;
9505 }
9506
9507 if (!cached_reg_grapheme_cluster) {
9508 onig_free(reg_grapheme_cluster);
9509 }
9510
9511 RB_GC_GUARD(str);
9512 if (ary)
9513 return ary;
9514 else
9515 return orig;
9516}
9517
9518/*
9519 * call-seq:
9520 * each_grapheme_cluster {|gc| ... } -> self
9521 * each_grapheme_cluster -> enumerator
9522 *
9523 * :include: doc/string/each_grapheme_cluster.rdoc
9524 *
9525 */
9526
9527static VALUE
9528rb_str_each_grapheme_cluster(VALUE str)
9529{
9530 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9531 return rb_str_enumerate_grapheme_clusters(str, 0);
9532}
9533
9534/*
9535 * call-seq:
9536 * grapheme_clusters -> array_of_grapheme_clusters
9537 *
9538 * :include: doc/string/grapheme_clusters.rdoc
9539 *
9540 */
9541
9542static VALUE
9543rb_str_grapheme_clusters(VALUE str)
9544{
9545 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9546 return rb_str_enumerate_grapheme_clusters(str, ary);
9547}
9548
9549static long
9550chopped_length(VALUE str)
9551{
9552 rb_encoding *enc = STR_ENC_GET(str);
9553 const char *p, *p2, *beg, *end;
9554
9555 beg = RSTRING_PTR(str);
9556 end = beg + RSTRING_LEN(str);
9557 if (beg >= end) return 0;
9558 p = rb_enc_prev_char(beg, end, end, enc);
9559 if (!p) return 0;
9560 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9561 p2 = rb_enc_prev_char(beg, p, end, enc);
9562 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9563 }
9564 return p - beg;
9565}
9566
9567/*
9568 * call-seq:
9569 * chop! -> self or nil
9570 *
9571 * Like String#chop, but modifies +self+ in place;
9572 * returns +nil+ if +self+ is empty, +self+ otherwise.
9573 *
9574 * Related: String#chomp!.
9575 */
9576
9577static VALUE
9578rb_str_chop_bang(VALUE str)
9579{
9580 str_modify_keep_cr(str);
9581 if (RSTRING_LEN(str) > 0) {
9582 long len;
9583 len = chopped_length(str);
9584 STR_SET_LEN(str, len);
9585 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9586 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9588 }
9589 return str;
9590 }
9591 return Qnil;
9592}
9593
9594
9595/*
9596 * call-seq:
9597 * chop -> new_string
9598 *
9599 * :include: doc/string/chop.rdoc
9600 *
9601 */
9602
9603static VALUE
9604rb_str_chop(VALUE str)
9605{
9606 return rb_str_subseq(str, 0, chopped_length(str));
9607}
9608
9609static long
9610smart_chomp(VALUE str, const char *e, const char *p)
9611{
9612 rb_encoding *enc = rb_enc_get(str);
9613 if (rb_enc_mbminlen(enc) > 1) {
9614 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9615 if (rb_enc_is_newline(pp, e, enc)) {
9616 e = pp;
9617 }
9618 pp = e - rb_enc_mbminlen(enc);
9619 if (pp >= p) {
9620 pp = rb_enc_left_char_head(p, pp, e, enc);
9621 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9622 e = pp;
9623 }
9624 }
9625 }
9626 else {
9627 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9628 case '\n':
9629 if (--e > p && *(e-1) == '\r') {
9630 --e;
9631 }
9632 break;
9633 case '\r':
9634 --e;
9635 break;
9636 }
9637 }
9638 return e - p;
9639}
9640
9641static long
9642chompped_length(VALUE str, VALUE rs)
9643{
9644 rb_encoding *enc;
9645 int newline;
9646 char *pp, *e, *rsptr;
9647 long rslen;
9648 char *const p = RSTRING_PTR(str);
9649 long len = RSTRING_LEN(str);
9650
9651 if (len == 0) return 0;
9652 e = p + len;
9653 if (rs == rb_default_rs) {
9654 return smart_chomp(str, e, p);
9655 }
9656
9657 enc = rb_enc_get(str);
9658 RSTRING_GETMEM(rs, rsptr, rslen);
9659 if (rslen == 0) {
9660 if (rb_enc_mbminlen(enc) > 1) {
9661 while (e > p) {
9662 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9663 if (!rb_enc_is_newline(pp, e, enc)) break;
9664 e = pp;
9665 pp -= rb_enc_mbminlen(enc);
9666 if (pp >= p) {
9667 pp = rb_enc_left_char_head(p, pp, e, enc);
9668 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9669 e = pp;
9670 }
9671 }
9672 }
9673 }
9674 else {
9675 while (e > p && *(e-1) == '\n') {
9676 --e;
9677 if (e > p && *(e-1) == '\r')
9678 --e;
9679 }
9680 }
9681 return e - p;
9682 }
9683 if (rslen > len) return len;
9684
9685 enc = rb_enc_get(rs);
9686 newline = rsptr[rslen-1];
9687 if (rslen == rb_enc_mbminlen(enc)) {
9688 if (rslen == 1) {
9689 if (newline == '\n')
9690 return smart_chomp(str, e, p);
9691 }
9692 else {
9693 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9694 return smart_chomp(str, e, p);
9695 }
9696 }
9697
9698 enc = rb_enc_check(str, rs);
9699 if (is_broken_string(rs)) {
9700 return len;
9701 }
9702 pp = e - rslen;
9703 if (p[len-1] == newline &&
9704 (rslen <= 1 ||
9705 memcmp(rsptr, pp, rslen) == 0)) {
9706 if (at_char_boundary(p, pp, e, enc))
9707 return len - rslen;
9708 RB_GC_GUARD(rs);
9709 }
9710 return len;
9711}
9712
9718static VALUE
9719chomp_rs(int argc, const VALUE *argv)
9720{
9721 rb_check_arity(argc, 0, 1);
9722 if (argc > 0) {
9723 VALUE rs = argv[0];
9724 if (!NIL_P(rs)) StringValue(rs);
9725 return rs;
9726 }
9727 else {
9728 return rb_rs;
9729 }
9730}
9731
9732VALUE
9733rb_str_chomp_string(VALUE str, VALUE rs)
9734{
9735 long olen = RSTRING_LEN(str);
9736 long len = chompped_length(str, rs);
9737 if (len >= olen) return Qnil;
9738 str_modify_keep_cr(str);
9739 STR_SET_LEN(str, len);
9740 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9741 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9743 }
9744 return str;
9745}
9746
9747/*
9748 * call-seq:
9749 * chomp!(line_sep = $/) -> self or nil
9750 *
9751 * Like String#chomp, but modifies +self+ in place;
9752 * returns +nil+ if no modification made, +self+ otherwise.
9753 *
9754 */
9755
9756static VALUE
9757rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9758{
9759 VALUE rs;
9760 str_modifiable(str);
9761 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
9762 rs = chomp_rs(argc, argv);
9763 if (NIL_P(rs)) return Qnil;
9764 return rb_str_chomp_string(str, rs);
9765}
9766
9767
9768/*
9769 * call-seq:
9770 * chomp(line_sep = $/) -> new_string
9771 *
9772 * :include: doc/string/chomp.rdoc
9773 *
9774 */
9775
9776static VALUE
9777rb_str_chomp(int argc, VALUE *argv, VALUE str)
9778{
9779 VALUE rs = chomp_rs(argc, argv);
9780 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9781 return rb_str_subseq(str, 0, chompped_length(str, rs));
9782}
9783
9784static long
9785lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9786{
9787 const char *const start = s;
9788
9789 if (!s || s >= e) return 0;
9790
9791 /* remove spaces at head */
9792 if (single_byte_optimizable(str)) {
9793 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9794 }
9795 else {
9796 while (s < e) {
9797 int n;
9798 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9799
9800 if (cc && !rb_isspace(cc)) break;
9801 s += n;
9802 }
9803 }
9804 return s - start;
9805}
9806
9807/*
9808 * call-seq:
9809 * lstrip! -> self or nil
9810 *
9811 * Like String#lstrip, except that any modifications are made in +self+;
9812 * returns +self+ if any modifications are made, +nil+ otherwise.
9813 *
9814 * Related: String#rstrip!, String#strip!.
9815 */
9816
9817static VALUE
9818rb_str_lstrip_bang(VALUE str)
9819{
9820 rb_encoding *enc;
9821 char *start, *s;
9822 long olen, loffset;
9823
9824 str_modify_keep_cr(str);
9825 enc = STR_ENC_GET(str);
9826 RSTRING_GETMEM(str, start, olen);
9827 loffset = lstrip_offset(str, start, start+olen, enc);
9828 if (loffset > 0) {
9829 long len = olen-loffset;
9830 s = start + loffset;
9831 memmove(start, s, len);
9832 STR_SET_LEN(str, len);
9833 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9834 return str;
9835 }
9836 return Qnil;
9837}
9838
9839
9840/*
9841 * call-seq:
9842 * lstrip -> new_string
9843 *
9844 * Returns a copy of +self+ with leading whitespace removed;
9845 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9846 *
9847 * whitespace = "\x00\t\n\v\f\r "
9848 * s = whitespace + 'abc' + whitespace
9849 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9850 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9851 *
9852 * Related: String#rstrip, String#strip.
9853 */
9854
9855static VALUE
9856rb_str_lstrip(VALUE str)
9857{
9858 char *start;
9859 long len, loffset;
9860 RSTRING_GETMEM(str, start, len);
9861 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9862 if (loffset <= 0) return str_duplicate(rb_cString, str);
9863 return rb_str_subseq(str, loffset, len - loffset);
9864}
9865
9866static long
9867rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9868{
9869 const char *t;
9870
9871 rb_str_check_dummy_enc(enc);
9873 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9874 }
9875 if (!s || s >= e) return 0;
9876 t = e;
9877
9878 /* remove trailing spaces or '\0's */
9879 if (single_byte_optimizable(str)) {
9880 unsigned char c;
9881 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9882 }
9883 else {
9884 char *tp;
9885
9886 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9887 unsigned int c = rb_enc_codepoint(tp, e, enc);
9888 if (c && !rb_isspace(c)) break;
9889 t = tp;
9890 }
9891 }
9892 return e - t;
9893}
9894
9895/*
9896 * call-seq:
9897 * rstrip! -> self or nil
9898 *
9899 * Like String#rstrip, except that any modifications are made in +self+;
9900 * returns +self+ if any modifications are made, +nil+ otherwise.
9901 *
9902 * Related: String#lstrip!, String#strip!.
9903 */
9904
9905static VALUE
9906rb_str_rstrip_bang(VALUE str)
9907{
9908 rb_encoding *enc;
9909 char *start;
9910 long olen, roffset;
9911
9912 str_modify_keep_cr(str);
9913 enc = STR_ENC_GET(str);
9914 RSTRING_GETMEM(str, start, olen);
9915 roffset = rstrip_offset(str, start, start+olen, enc);
9916 if (roffset > 0) {
9917 long len = olen - roffset;
9918
9919 STR_SET_LEN(str, len);
9920 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9921 return str;
9922 }
9923 return Qnil;
9924}
9925
9926
9927/*
9928 * call-seq:
9929 * rstrip -> new_string
9930 *
9931 * Returns a copy of the receiver with trailing whitespace removed;
9932 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9933 *
9934 * whitespace = "\x00\t\n\v\f\r "
9935 * s = whitespace + 'abc' + whitespace
9936 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9937 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9938 *
9939 * Related: String#lstrip, String#strip.
9940 */
9941
9942static VALUE
9943rb_str_rstrip(VALUE str)
9944{
9945 rb_encoding *enc;
9946 char *start;
9947 long olen, roffset;
9948
9949 enc = STR_ENC_GET(str);
9950 RSTRING_GETMEM(str, start, olen);
9951 roffset = rstrip_offset(str, start, start+olen, enc);
9952
9953 if (roffset <= 0) return str_duplicate(rb_cString, str);
9954 return rb_str_subseq(str, 0, olen-roffset);
9955}
9956
9957
9958/*
9959 * call-seq:
9960 * strip! -> self or nil
9961 *
9962 * Like String#strip, except that any modifications are made in +self+;
9963 * returns +self+ if any modifications are made, +nil+ otherwise.
9964 *
9965 * Related: String#lstrip!, String#rstrip!.
9966 */
9967
9968static VALUE
9969rb_str_strip_bang(VALUE str)
9970{
9971 char *start;
9972 long olen, loffset, roffset;
9973 rb_encoding *enc;
9974
9975 str_modify_keep_cr(str);
9976 enc = STR_ENC_GET(str);
9977 RSTRING_GETMEM(str, start, olen);
9978 loffset = lstrip_offset(str, start, start+olen, enc);
9979 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9980
9981 if (loffset > 0 || roffset > 0) {
9982 long len = olen-roffset;
9983 if (loffset > 0) {
9984 len -= loffset;
9985 memmove(start, start + loffset, len);
9986 }
9987 STR_SET_LEN(str, len);
9988 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9989 return str;
9990 }
9991 return Qnil;
9992}
9993
9994
9995/*
9996 * call-seq:
9997 * strip -> new_string
9998 *
9999 * Returns a copy of the receiver with leading and trailing whitespace removed;
10000 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
10001 *
10002 * whitespace = "\x00\t\n\v\f\r "
10003 * s = whitespace + 'abc' + whitespace
10004 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
10005 * s.strip # => "abc"
10006 *
10007 * Related: String#lstrip, String#rstrip.
10008 */
10009
10010static VALUE
10011rb_str_strip(VALUE str)
10012{
10013 char *start;
10014 long olen, loffset, roffset;
10015 rb_encoding *enc = STR_ENC_GET(str);
10016
10017 RSTRING_GETMEM(str, start, olen);
10018 loffset = lstrip_offset(str, start, start+olen, enc);
10019 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
10020
10021 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
10022 return rb_str_subseq(str, loffset, olen-loffset-roffset);
10023}
10024
10025static VALUE
10026scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
10027{
10028 VALUE result = Qnil;
10029 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
10030 if (pos >= 0) {
10031 VALUE match;
10032 struct re_registers *regs;
10033 if (BUILTIN_TYPE(pat) == T_STRING) {
10034 regs = NULL;
10035 end = pos + RSTRING_LEN(pat);
10036 }
10037 else {
10038 match = rb_backref_get();
10039 regs = RMATCH_REGS(match);
10040 pos = BEG(0);
10041 end = END(0);
10042 }
10043
10044 if (pos == end) {
10045 rb_encoding *enc = STR_ENC_GET(str);
10046 /*
10047 * Always consume at least one character of the input string
10048 */
10049 if (RSTRING_LEN(str) > end)
10050 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
10051 RSTRING_END(str), enc);
10052 else
10053 *start = end + 1;
10054 }
10055 else {
10056 *start = end;
10057 }
10058
10059 if (!regs || regs->num_regs == 1) {
10060 result = rb_str_subseq(str, pos, end - pos);
10061 return result;
10062 }
10063 else {
10064 result = rb_ary_new2(regs->num_regs);
10065 for (int i = 1; i < regs->num_regs; i++) {
10066 VALUE s = Qnil;
10067 if (BEG(i) >= 0) {
10068 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10069 }
10070
10071 rb_ary_push(result, s);
10072 }
10073 }
10074
10075 RB_GC_GUARD(match);
10076 }
10077
10078 return result;
10079}
10080
10081
10082/*
10083 * call-seq:
10084 * scan(string_or_regexp) -> array
10085 * scan(string_or_regexp) {|matches| ... } -> self
10086 *
10087 * Matches a pattern against +self+; the pattern is:
10088 *
10089 * - +string_or_regexp+ itself, if it is a Regexp.
10090 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10091 *
10092 * Iterates through +self+, generating a collection of matching results:
10093 *
10094 * - If the pattern contains no groups, each result is the
10095 * matched string, <code>$&</code>.
10096 * - If the pattern contains groups, each result is an array
10097 * containing one entry per group.
10098 *
10099 * With no block given, returns an array of the results:
10100 *
10101 * s = 'cruel world'
10102 * s.scan(/\w+/) # => ["cruel", "world"]
10103 * s.scan(/.../) # => ["cru", "el ", "wor"]
10104 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10105 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10106 *
10107 * With a block given, calls the block with each result; returns +self+:
10108 *
10109 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10110 * print "\n"
10111 * s.scan(/(.)(.)/) {|x,y| print y, x }
10112 * print "\n"
10113 *
10114 * Output:
10115 *
10116 * <<cruel>> <<world>>
10117 * rceu lowlr
10118 *
10119 */
10120
10121static VALUE
10122rb_str_scan(VALUE str, VALUE pat)
10123{
10124 VALUE result;
10125 long start = 0;
10126 long last = -1, prev = 0;
10127 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10128
10129 pat = get_pat_quoted(pat, 1);
10130 mustnot_broken(str);
10131 if (!rb_block_given_p()) {
10132 VALUE ary = rb_ary_new();
10133
10134 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10135 last = prev;
10136 prev = start;
10137 rb_ary_push(ary, result);
10138 }
10139 if (last >= 0) rb_pat_search(pat, str, last, 1);
10140 else rb_backref_set(Qnil);
10141 return ary;
10142 }
10143
10144 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10145 last = prev;
10146 prev = start;
10147 rb_yield(result);
10148 str_mod_check(str, p, len);
10149 }
10150 if (last >= 0) rb_pat_search(pat, str, last, 1);
10151 return str;
10152}
10153
10154
10155/*
10156 * call-seq:
10157 * hex -> integer
10158 *
10159 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10160 * (with an optional sign and an optional <code>0x</code>) and returns the
10161 * corresponding number;
10162 * returns zero if there is no such leading substring:
10163 *
10164 * '0x0a'.hex # => 10
10165 * '-1234'.hex # => -4660
10166 * '0'.hex # => 0
10167 * 'non-numeric'.hex # => 0
10168 *
10169 * Related: String#oct.
10170 *
10171 */
10172
10173static VALUE
10174rb_str_hex(VALUE str)
10175{
10176 return rb_str_to_inum(str, 16, FALSE);
10177}
10178
10179
10180/*
10181 * call-seq:
10182 * oct -> integer
10183 *
10184 * Interprets the leading substring of +self+ as a string of octal digits
10185 * (with an optional sign) and returns the corresponding number;
10186 * returns zero if there is no such leading substring:
10187 *
10188 * '123'.oct # => 83
10189 * '-377'.oct # => -255
10190 * '0377non-numeric'.oct # => 255
10191 * 'non-numeric'.oct # => 0
10192 *
10193 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10194 * see Kernel#Integer.
10195 *
10196 * Related: String#hex.
10197 *
10198 */
10199
10200static VALUE
10201rb_str_oct(VALUE str)
10202{
10203 return rb_str_to_inum(str, -8, FALSE);
10204}
10205
10206#ifndef HAVE_CRYPT_R
10207# include "ruby/thread_native.h"
10208# include "ruby/atomic.h"
10209
10210static struct {
10211 rb_nativethread_lock_t lock;
10212} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};
10213
10214static void
10215crypt_mutex_initialize(void)
10216{
10217}
10218#endif
10219
10220/*
10221 * call-seq:
10222 * crypt(salt_str) -> new_string
10223 *
10224 * Returns the string generated by calling <code>crypt(3)</code>
10225 * standard library function with <code>str</code> and
10226 * <code>salt_str</code>, in this order, as its arguments. Please do
10227 * not use this method any longer. It is legacy; provided only for
10228 * backward compatibility with ruby scripts in earlier days. It is
10229 * bad to use in contemporary programs for several reasons:
10230 *
10231 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10232 * run. The generated string lacks data portability.
10233 *
10234 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10235 * (i.e. silently ends up in unexpected results).
10236 *
10237 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10238 * thread safe.
10239 *
10240 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10241 * very very weak. According to its manpage, Linux's traditional
10242 * <code>crypt(3)</code> output has only 2**56 variations; too
10243 * easy to brute force today. And this is the default behaviour.
10244 *
10245 * * In order to make things robust some OSes implement so-called
10246 * "modular" usage. To go through, you have to do a complex
10247 * build-up of the <code>salt_str</code> parameter, by hand.
10248 * Failure in generation of a proper salt string tends not to
10249 * yield any errors; typos in parameters are normally not
10250 * detectable.
10251 *
10252 * * For instance, in the following example, the second invocation
10253 * of String#crypt is wrong; it has a typo in "round=" (lacks
10254 * "s"). However the call does not fail and something unexpected
10255 * is generated.
10256 *
10257 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10258 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10259 *
10260 * * Even in the "modular" mode, some hash functions are considered
10261 * archaic and no longer recommended at all; for instance module
10262 * <code>$1$</code> is officially abandoned by its author: see
10263 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10264 * instance module <code>$3$</code> is considered completely
10265 * broken: see the manpage of FreeBSD.
10266 *
10267 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10268 * written above, <code>crypt(3)</code> on Mac OS never fails.
10269 * This means even if you build up a proper salt string it
10270 * generates a traditional DES hash anyways, and there is no way
10271 * for you to be aware of.
10272 *
10273 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10274 *
10275 * If for some reason you cannot migrate to other secure contemporary
10276 * password hashing algorithms, install the string-crypt gem and
10277 * <code>require 'string/crypt'</code> to continue using it.
10278 */
10279
10280static VALUE
10281rb_str_crypt(VALUE str, VALUE salt)
10282{
10283#ifdef HAVE_CRYPT_R
10284 VALUE databuf;
10285 struct crypt_data *data;
10286# define CRYPT_END() ALLOCV_END(databuf)
10287#else
10288 extern char *crypt(const char *, const char *);
10289# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
10290#endif
10291 VALUE result;
10292 const char *s, *saltp;
10293 char *res;
10294#ifdef BROKEN_CRYPT
10295 char salt_8bit_clean[3];
10296#endif
10297
10298 StringValue(salt);
10299 mustnot_wchar(str);
10300 mustnot_wchar(salt);
10301 s = StringValueCStr(str);
10302 saltp = RSTRING_PTR(salt);
10303 if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
10304 rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
10305 }
10306
10307#ifdef BROKEN_CRYPT
10308 if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
10309 salt_8bit_clean[0] = saltp[0] & 0x7f;
10310 salt_8bit_clean[1] = saltp[1] & 0x7f;
10311 salt_8bit_clean[2] = '\0';
10312 saltp = salt_8bit_clean;
10313 }
10314#endif
10315#ifdef HAVE_CRYPT_R
10316 data = ALLOCV(databuf, sizeof(struct crypt_data));
10317# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
10318 data->initialized = 0;
10319# endif
10320 res = crypt_r(s, saltp, data);
10321#else
10322 crypt_mutex_initialize();
10323 rb_nativethread_lock_lock(&crypt_mutex.lock);
10324 res = crypt(s, saltp);
10325#endif
10326 if (!res) {
10327 int err = errno;
10328 CRYPT_END();
10329 rb_syserr_fail(err, "crypt");
10330 }
10331 result = rb_str_new_cstr(res);
10332 CRYPT_END();
10333 return result;
10334}
10335
10336
10337/*
10338 * call-seq:
10339 * ord -> integer
10340 *
10341 * :include: doc/string/ord.rdoc
10342 *
10343 */
10344
10345static VALUE
10346rb_str_ord(VALUE s)
10347{
10348 unsigned int c;
10349
10350 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10351 return UINT2NUM(c);
10352}
10353/*
10354 * call-seq:
10355 * sum(n = 16) -> integer
10356 *
10357 * :include: doc/string/sum.rdoc
10358 *
10359 */
10360
10361static VALUE
10362rb_str_sum(int argc, VALUE *argv, VALUE str)
10363{
10364 int bits = 16;
10365 char *ptr, *p, *pend;
10366 long len;
10367 VALUE sum = INT2FIX(0);
10368 unsigned long sum0 = 0;
10369
10370 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10371 bits = 0;
10372 }
10373 ptr = p = RSTRING_PTR(str);
10374 len = RSTRING_LEN(str);
10375 pend = p + len;
10376
10377 while (p < pend) {
10378 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10379 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10380 str_mod_check(str, ptr, len);
10381 sum0 = 0;
10382 }
10383 sum0 += (unsigned char)*p;
10384 p++;
10385 }
10386
10387 if (bits == 0) {
10388 if (sum0) {
10389 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10390 }
10391 }
10392 else {
10393 if (sum == INT2FIX(0)) {
10394 if (bits < (int)sizeof(long)*CHAR_BIT) {
10395 sum0 &= (((unsigned long)1)<<bits)-1;
10396 }
10397 sum = LONG2FIX(sum0);
10398 }
10399 else {
10400 VALUE mod;
10401
10402 if (sum0) {
10403 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10404 }
10405
10406 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10407 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10408 sum = rb_funcall(sum, '&', 1, mod);
10409 }
10410 }
10411 return sum;
10412}
10413
10414static VALUE
10415rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
10416{
10417 rb_encoding *enc;
10418 VALUE w;
10419 long width, len, flen = 1, fclen = 1;
10420 VALUE res;
10421 char *p;
10422 const char *f = " ";
10423 long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
10424 VALUE pad;
10425 int singlebyte = 1, cr;
10426 int termlen;
10427
10428 rb_scan_args(argc, argv, "11", &w, &pad);
10429 enc = STR_ENC_GET(str);
10430 termlen = rb_enc_mbminlen(enc);
10431 width = NUM2LONG(w);
10432 if (argc == 2) {
10433 StringValue(pad);
10434 enc = rb_enc_check(str, pad);
10435 f = RSTRING_PTR(pad);
10436 flen = RSTRING_LEN(pad);
10437 fclen = str_strlen(pad, enc); /* rb_enc_check */
10438 singlebyte = single_byte_optimizable(pad);
10439 if (flen == 0 || fclen == 0) {
10440 rb_raise(rb_eArgError, "zero width padding");
10441 }
10442 }
10443 len = str_strlen(str, enc); /* rb_enc_check */
10444 if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
10445 n = width - len;
10446 llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
10447 rlen = n - llen;
10448 cr = ENC_CODERANGE(str);
10449 if (flen > 1) {
10450 llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
10451 rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
10452 }
10453 size = RSTRING_LEN(str);
10454 if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
10455 (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
10456 (len += llen2 + rlen2) >= LONG_MAX - size) {
10457 rb_raise(rb_eArgError, "argument too big");
10458 }
10459 len += size;
10460 res = str_new0(rb_cString, 0, len, termlen);
10461 p = RSTRING_PTR(res);
10462 if (flen <= 1) {
10463 memset(p, *f, llen);
10464 p += llen;
10465 }
10466 else {
10467 while (llen >= fclen) {
10468 memcpy(p,f,flen);
10469 p += flen;
10470 llen -= fclen;
10471 }
10472 if (llen > 0) {
10473 memcpy(p, f, llen2);
10474 p += llen2;
10475 }
10476 }
10477 memcpy(p, RSTRING_PTR(str), size);
10478 p += size;
10479 if (flen <= 1) {
10480 memset(p, *f, rlen);
10481 p += rlen;
10482 }
10483 else {
10484 while (rlen >= fclen) {
10485 memcpy(p,f,flen);
10486 p += flen;
10487 rlen -= fclen;
10488 }
10489 if (rlen > 0) {
10490 memcpy(p, f, rlen2);
10491 p += rlen2;
10492 }
10493 }
10494 TERM_FILL(p, termlen);
10495 STR_SET_LEN(res, p-RSTRING_PTR(res));
10496 rb_enc_associate(res, enc);
10497 if (argc == 2)
10498 cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
10499 if (cr != ENC_CODERANGE_BROKEN)
10500 ENC_CODERANGE_SET(res, cr);
10501
10502 RB_GC_GUARD(pad);
10503 return res;
10504}
10505
10506
10507/*
10508 * call-seq:
10509 * ljust(size, pad_string = ' ') -> new_string
10510 *
10511 * :include: doc/string/ljust.rdoc
10512 *
10513 * Related: String#rjust, String#center.
10514 *
10515 */
10516
10517static VALUE
10518rb_str_ljust(int argc, VALUE *argv, VALUE str)
10519{
10520 return rb_str_justify(argc, argv, str, 'l');
10521}
10522
10523/*
10524 * call-seq:
10525 * rjust(size, pad_string = ' ') -> new_string
10526 *
10527 * :include: doc/string/rjust.rdoc
10528 *
10529 * Related: String#ljust, String#center.
10530 *
10531 */
10532
10533static VALUE
10534rb_str_rjust(int argc, VALUE *argv, VALUE str)
10535{
10536 return rb_str_justify(argc, argv, str, 'r');
10537}
10538
10539
10540/*
10541 * call-seq:
10542 * center(size, pad_string = ' ') -> new_string
10543 *
10544 * :include: doc/string/center.rdoc
10545 *
10546 * Related: String#ljust, String#rjust.
10547 *
10548 */
10549
10550static VALUE
10551rb_str_center(int argc, VALUE *argv, VALUE str)
10552{
10553 return rb_str_justify(argc, argv, str, 'c');
10554}
10555
10556/*
10557 * call-seq:
10558 * partition(string_or_regexp) -> [head, match, tail]
10559 *
10560 * :include: doc/string/partition.rdoc
10561 *
10562 */
10563
10564static VALUE
10565rb_str_partition(VALUE str, VALUE sep)
10566{
10567 long pos;
10568
10569 sep = get_pat_quoted(sep, 0);
10570 if (RB_TYPE_P(sep, T_REGEXP)) {
10571 if (rb_reg_search(sep, str, 0, 0) < 0) {
10572 goto failed;
10573 }
10574 VALUE match = rb_backref_get();
10575 struct re_registers *regs = RMATCH_REGS(match);
10576
10577 pos = BEG(0);
10578 sep = rb_str_subseq(str, pos, END(0) - pos);
10579 }
10580 else {
10581 pos = rb_str_index(str, sep, 0);
10582 if (pos < 0) goto failed;
10583 }
10584 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10585 sep,
10586 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10587 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10588
10589 failed:
10590 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10591}
10592
10593/*
10594 * call-seq:
10595 * rpartition(sep) -> [head, match, tail]
10596 *
10597 * :include: doc/string/rpartition.rdoc
10598 *
10599 */
10600
10601static VALUE
10602rb_str_rpartition(VALUE str, VALUE sep)
10603{
10604 long pos = RSTRING_LEN(str);
10605
10606 sep = get_pat_quoted(sep, 0);
10607 if (RB_TYPE_P(sep, T_REGEXP)) {
10608 if (rb_reg_search(sep, str, pos, 1) < 0) {
10609 goto failed;
10610 }
10611 VALUE match = rb_backref_get();
10612 struct re_registers *regs = RMATCH_REGS(match);
10613
10614 pos = BEG(0);
10615 sep = rb_str_subseq(str, pos, END(0) - pos);
10616 }
10617 else {
10618 pos = rb_str_sublen(str, pos);
10619 pos = rb_str_rindex(str, sep, pos);
10620 if (pos < 0) {
10621 goto failed;
10622 }
10623 }
10624
10625 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10626 sep,
10627 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10628 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10629 failed:
10630 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10631}
10632
10633/*
10634 * call-seq:
10635 * start_with?(*string_or_regexp) -> true or false
10636 *
10637 * :include: doc/string/start_with_p.rdoc
10638 *
10639 */
10640
10641static VALUE
10642rb_str_start_with(int argc, VALUE *argv, VALUE str)
10643{
10644 int i;
10645
10646 for (i=0; i<argc; i++) {
10647 VALUE tmp = argv[i];
10648 if (RB_TYPE_P(tmp, T_REGEXP)) {
10649 if (rb_reg_start_with_p(tmp, str))
10650 return Qtrue;
10651 }
10652 else {
10653 const char *p, *s, *e;
10654 long slen, tlen;
10655 rb_encoding *enc;
10656
10657 StringValue(tmp);
10658 enc = rb_enc_check(str, tmp);
10659 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10660 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10661 p = RSTRING_PTR(str);
10662 e = p + slen;
10663 s = p + tlen;
10664 if (!at_char_right_boundary(p, s, e, enc))
10665 continue;
10666 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10667 return Qtrue;
10668 }
10669 }
10670 return Qfalse;
10671}
10672
10673/*
10674 * call-seq:
10675 * end_with?(*strings) -> true or false
10676 *
10677 * :include: doc/string/end_with_p.rdoc
10678 *
10679 */
10680
10681static VALUE
10682rb_str_end_with(int argc, VALUE *argv, VALUE str)
10683{
10684 int i;
10685
10686 for (i=0; i<argc; i++) {
10687 VALUE tmp = argv[i];
10688 const char *p, *s, *e;
10689 long slen, tlen;
10690 rb_encoding *enc;
10691
10692 StringValue(tmp);
10693 enc = rb_enc_check(str, tmp);
10694 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10695 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10696 p = RSTRING_PTR(str);
10697 e = p + slen;
10698 s = e - tlen;
10699 if (!at_char_boundary(p, s, e, enc))
10700 continue;
10701 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10702 return Qtrue;
10703 }
10704 return Qfalse;
10705}
10706
10716static long
10717deleted_prefix_length(VALUE str, VALUE prefix)
10718{
10719 const char *strptr, *prefixptr;
10720 long olen, prefixlen;
10721 rb_encoding *enc = rb_enc_get(str);
10722
10723 StringValue(prefix);
10724
10725 if (!is_broken_string(prefix) ||
10726 !rb_enc_asciicompat(enc) ||
10727 !rb_enc_asciicompat(rb_enc_get(prefix))) {
10728 enc = rb_enc_check(str, prefix);
10729 }
10730
10731 /* return 0 if not start with prefix */
10732 prefixlen = RSTRING_LEN(prefix);
10733 if (prefixlen <= 0) return 0;
10734 olen = RSTRING_LEN(str);
10735 if (olen < prefixlen) return 0;
10736 strptr = RSTRING_PTR(str);
10737 prefixptr = RSTRING_PTR(prefix);
10738 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10739 if (is_broken_string(prefix)) {
10740 if (!is_broken_string(str)) {
10741 /* prefix in a valid string cannot be broken */
10742 return 0;
10743 }
10744 const char *strend = strptr + olen;
10745 const char *after_prefix = strptr + prefixlen;
10746 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10747 /* prefix does not end at char-boundary */
10748 return 0;
10749 }
10750 }
10751 /* prefix part in `str` also should be valid. */
10752
10753 return prefixlen;
10754}
10755
10756/*
10757 * call-seq:
10758 * delete_prefix!(prefix) -> self or nil
10759 *
10760 * Like String#delete_prefix, except that +self+ is modified in place.
10761 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10762 *
10763 */
10764
10765static VALUE
10766rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10767{
10768 long prefixlen;
10769 str_modify_keep_cr(str);
10770
10771 prefixlen = deleted_prefix_length(str, prefix);
10772 if (prefixlen <= 0) return Qnil;
10773
10774 return rb_str_drop_bytes(str, prefixlen);
10775}
10776
10777/*
10778 * call-seq:
10779 * delete_prefix(prefix) -> new_string
10780 *
10781 * :include: doc/string/delete_prefix.rdoc
10782 *
10783 */
10784
10785static VALUE
10786rb_str_delete_prefix(VALUE str, VALUE prefix)
10787{
10788 long prefixlen;
10789
10790 prefixlen = deleted_prefix_length(str, prefix);
10791 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10792
10793 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10794}
10795
10805static long
10806deleted_suffix_length(VALUE str, VALUE suffix)
10807{
10808 const char *strptr, *suffixptr;
10809 long olen, suffixlen;
10810 rb_encoding *enc;
10811
10812 StringValue(suffix);
10813 if (is_broken_string(suffix)) return 0;
10814 enc = rb_enc_check(str, suffix);
10815
10816 /* return 0 if not start with suffix */
10817 suffixlen = RSTRING_LEN(suffix);
10818 if (suffixlen <= 0) return 0;
10819 olen = RSTRING_LEN(str);
10820 if (olen < suffixlen) return 0;
10821 strptr = RSTRING_PTR(str);
10822 suffixptr = RSTRING_PTR(suffix);
10823 const char *strend = strptr + olen;
10824 const char *before_suffix = strend - suffixlen;
10825 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
10826 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
10827
10828 return suffixlen;
10829}
10830
10831/*
10832 * call-seq:
10833 * delete_suffix!(suffix) -> self or nil
10834 *
10835 * Like String#delete_suffix, except that +self+ is modified in place.
10836 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10837 *
10838 */
10839
10840static VALUE
10841rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10842{
10843 long olen, suffixlen, len;
10844 str_modifiable(str);
10845
10846 suffixlen = deleted_suffix_length(str, suffix);
10847 if (suffixlen <= 0) return Qnil;
10848
10849 olen = RSTRING_LEN(str);
10850 str_modify_keep_cr(str);
10851 len = olen - suffixlen;
10852 STR_SET_LEN(str, len);
10853 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10854 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10856 }
10857 return str;
10858}
10859
10860/*
10861 * call-seq:
10862 * delete_suffix(suffix) -> new_string
10863 *
10864 * :include: doc/string/delete_suffix.rdoc
10865 *
10866 */
10867
10868static VALUE
10869rb_str_delete_suffix(VALUE str, VALUE suffix)
10870{
10871 long suffixlen;
10872
10873 suffixlen = deleted_suffix_length(str, suffix);
10874 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10875
10876 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10877}
10878
10879void
10880rb_str_setter(VALUE val, ID id, VALUE *var)
10881{
10882 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10883 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10884 }
10885 *var = val;
10886}
10887
10888static void
10889rb_fs_setter(VALUE val, ID id, VALUE *var)
10890{
10891 val = rb_fs_check(val);
10892 if (!val) {
10893 rb_raise(rb_eTypeError,
10894 "value of %"PRIsVALUE" must be String or Regexp",
10895 rb_id2str(id));
10896 }
10897 if (!NIL_P(val)) {
10898 rb_warn_deprecated("`$;'", NULL);
10899 }
10900 *var = val;
10901}
10902
10903
10904/*
10905 * call-seq:
10906 * force_encoding(encoding) -> self
10907 *
10908 * :include: doc/string/force_encoding.rdoc
10909 *
10910 */
10911
10912static VALUE
10913rb_str_force_encoding(VALUE str, VALUE enc)
10914{
10915 str_modifiable(str);
10916
10917 rb_encoding *encoding = rb_to_encoding(enc);
10918 int idx = rb_enc_to_index(encoding);
10919
10920 // If the encoding is unchanged, we do nothing.
10921 if (ENCODING_GET(str) == idx) {
10922 return str;
10923 }
10924
10925 rb_enc_associate_index(str, idx);
10926
10927 // If the coderange was 7bit and the new encoding is ASCII-compatible
10928 // we can keep the coderange.
10929 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
10930 return str;
10931 }
10932
10934 return str;
10935}
10936
10937/*
10938 * call-seq:
10939 * b -> string
10940 *
10941 * :include: doc/string/b.rdoc
10942 *
10943 */
10944
10945static VALUE
10946rb_str_b(VALUE str)
10947{
10948 VALUE str2;
10949 if (STR_EMBED_P(str)) {
10950 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
10951 }
10952 else {
10953 str2 = str_alloc_heap(rb_cString);
10954 }
10955 str_replace_shared_without_enc(str2, str);
10956
10957 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10958 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10959 // If we know the receiver's code range then we know the result's code range.
10960 int cr = ENC_CODERANGE(str);
10961 switch (cr) {
10962 case ENC_CODERANGE_7BIT:
10964 break;
10968 break;
10969 default:
10970 ENC_CODERANGE_CLEAR(str2);
10971 break;
10972 }
10973 }
10974
10975 return str2;
10976}
10977
10978/*
10979 * call-seq:
10980 * valid_encoding? -> true or false
10981 *
10982 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10983 *
10984 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10985 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10986 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10987 */
10988
10989static VALUE
10990rb_str_valid_encoding_p(VALUE str)
10991{
10992 int cr = rb_enc_str_coderange(str);
10993
10994 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10995}
10996
10997/*
10998 * call-seq:
10999 * ascii_only? -> true or false
11000 *
11001 * Returns +true+ if +self+ contains only ASCII characters,
11002 * +false+ otherwise:
11003 *
11004 * 'abc'.ascii_only? # => true
11005 * "abc\u{6666}".ascii_only? # => false
11006 *
11007 */
11008
11009static VALUE
11010rb_str_is_ascii_only_p(VALUE str)
11011{
11012 int cr = rb_enc_str_coderange(str);
11013
11014 return RBOOL(cr == ENC_CODERANGE_7BIT);
11015}
11016
11017VALUE
11019{
11020 static const char ellipsis[] = "...";
11021 const long ellipsislen = sizeof(ellipsis) - 1;
11022 rb_encoding *const enc = rb_enc_get(str);
11023 const long blen = RSTRING_LEN(str);
11024 const char *const p = RSTRING_PTR(str), *e = p + blen;
11025 VALUE estr, ret = 0;
11026
11027 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
11028 if (len * rb_enc_mbminlen(enc) >= blen ||
11029 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
11030 ret = str;
11031 }
11032 else if (len <= ellipsislen ||
11033 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
11034 if (rb_enc_asciicompat(enc)) {
11035 ret = rb_str_new(ellipsis, len);
11036 rb_enc_associate(ret, enc);
11037 }
11038 else {
11039 estr = rb_usascii_str_new(ellipsis, len);
11040 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
11041 }
11042 }
11043 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
11044 rb_str_cat(ret, ellipsis, ellipsislen);
11045 }
11046 else {
11047 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
11048 rb_enc_from_encoding(enc), 0, Qnil);
11049 rb_str_append(ret, estr);
11050 }
11051 return ret;
11052}
11053
11054static VALUE
11055str_compat_and_valid(VALUE str, rb_encoding *enc)
11056{
11057 int cr;
11058 str = StringValue(str);
11059 cr = rb_enc_str_coderange(str);
11060 if (cr == ENC_CODERANGE_BROKEN) {
11061 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
11062 }
11063 else {
11064 rb_encoding *e = STR_ENC_GET(str);
11065 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11066 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11067 rb_enc_name(enc), rb_enc_name(e));
11068 }
11069 }
11070 return str;
11071}
11072
11073static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11074
11075VALUE
11077{
11078 rb_encoding *enc = STR_ENC_GET(str);
11079 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11080}
11081
11082VALUE
11083rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11084{
11085 int cr = ENC_CODERANGE_UNKNOWN;
11086 if (enc == STR_ENC_GET(str)) {
11087 /* cached coderange makes sense only when enc equals the
11088 * actual encoding of str */
11089 cr = ENC_CODERANGE(str);
11090 }
11091 return enc_str_scrub(enc, str, repl, cr);
11092}
11093
11094static VALUE
11095enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11096{
11097 int encidx;
11098 VALUE buf = Qnil;
11099 const char *rep, *p, *e, *p1, *sp;
11100 long replen = -1;
11101 long slen;
11102
11103 if (rb_block_given_p()) {
11104 if (!NIL_P(repl))
11105 rb_raise(rb_eArgError, "both of block and replacement given");
11106 replen = 0;
11107 }
11108
11109 if (ENC_CODERANGE_CLEAN_P(cr))
11110 return Qnil;
11111
11112 if (!NIL_P(repl)) {
11113 repl = str_compat_and_valid(repl, enc);
11114 }
11115
11116 if (rb_enc_dummy_p(enc)) {
11117 return Qnil;
11118 }
11119 encidx = rb_enc_to_index(enc);
11120
11121#define DEFAULT_REPLACE_CHAR(str) do { \
11122 static const char replace[sizeof(str)-1] = str; \
11123 rep = replace; replen = (int)sizeof(replace); \
11124 } while (0)
11125
11126 slen = RSTRING_LEN(str);
11127 p = RSTRING_PTR(str);
11128 e = RSTRING_END(str);
11129 p1 = p;
11130 sp = p;
11131
11132 if (rb_enc_asciicompat(enc)) {
11133 int rep7bit_p;
11134 if (!replen) {
11135 rep = NULL;
11136 rep7bit_p = FALSE;
11137 }
11138 else if (!NIL_P(repl)) {
11139 rep = RSTRING_PTR(repl);
11140 replen = RSTRING_LEN(repl);
11141 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11142 }
11143 else if (encidx == rb_utf8_encindex()) {
11144 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11145 rep7bit_p = FALSE;
11146 }
11147 else {
11148 DEFAULT_REPLACE_CHAR("?");
11149 rep7bit_p = TRUE;
11150 }
11151 cr = ENC_CODERANGE_7BIT;
11152
11153 p = search_nonascii(p, e);
11154 if (!p) {
11155 p = e;
11156 }
11157 while (p < e) {
11158 int ret = rb_enc_precise_mbclen(p, e, enc);
11159 if (MBCLEN_NEEDMORE_P(ret)) {
11160 break;
11161 }
11162 else if (MBCLEN_CHARFOUND_P(ret)) {
11164 p += MBCLEN_CHARFOUND_LEN(ret);
11165 }
11166 else if (MBCLEN_INVALID_P(ret)) {
11167 /*
11168 * p1~p: valid ascii/multibyte chars
11169 * p ~e: invalid bytes + unknown bytes
11170 */
11171 long clen = rb_enc_mbmaxlen(enc);
11172 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11173 if (p > p1) {
11174 rb_str_buf_cat(buf, p1, p - p1);
11175 }
11176
11177 if (e - p < clen) clen = e - p;
11178 if (clen <= 2) {
11179 clen = 1;
11180 }
11181 else {
11182 const char *q = p;
11183 clen--;
11184 for (; clen > 1; clen--) {
11185 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11186 if (MBCLEN_NEEDMORE_P(ret)) break;
11187 if (MBCLEN_INVALID_P(ret)) continue;
11189 }
11190 }
11191 if (rep) {
11192 rb_str_buf_cat(buf, rep, replen);
11193 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11194 }
11195 else {
11196 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11197 str_mod_check(str, sp, slen);
11198 repl = str_compat_and_valid(repl, enc);
11199 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11202 }
11203 p += clen;
11204 p1 = p;
11205 p = search_nonascii(p, e);
11206 if (!p) {
11207 p = e;
11208 break;
11209 }
11210 }
11211 else {
11213 }
11214 }
11215 if (NIL_P(buf)) {
11216 if (p == e) {
11217 ENC_CODERANGE_SET(str, cr);
11218 return Qnil;
11219 }
11220 buf = rb_str_buf_new(RSTRING_LEN(str));
11221 }
11222 if (p1 < p) {
11223 rb_str_buf_cat(buf, p1, p - p1);
11224 }
11225 if (p < e) {
11226 if (rep) {
11227 rb_str_buf_cat(buf, rep, replen);
11228 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11229 }
11230 else {
11231 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11232 str_mod_check(str, sp, slen);
11233 repl = str_compat_and_valid(repl, enc);
11234 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11237 }
11238 }
11239 }
11240 else {
11241 /* ASCII incompatible */
11242 long mbminlen = rb_enc_mbminlen(enc);
11243 if (!replen) {
11244 rep = NULL;
11245 }
11246 else if (!NIL_P(repl)) {
11247 rep = RSTRING_PTR(repl);
11248 replen = RSTRING_LEN(repl);
11249 }
11250 else if (encidx == ENCINDEX_UTF_16BE) {
11251 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11252 }
11253 else if (encidx == ENCINDEX_UTF_16LE) {
11254 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11255 }
11256 else if (encidx == ENCINDEX_UTF_32BE) {
11257 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11258 }
11259 else if (encidx == ENCINDEX_UTF_32LE) {
11260 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11261 }
11262 else {
11263 DEFAULT_REPLACE_CHAR("?");
11264 }
11265
11266 while (p < e) {
11267 int ret = rb_enc_precise_mbclen(p, e, enc);
11268 if (MBCLEN_NEEDMORE_P(ret)) {
11269 break;
11270 }
11271 else if (MBCLEN_CHARFOUND_P(ret)) {
11272 p += MBCLEN_CHARFOUND_LEN(ret);
11273 }
11274 else if (MBCLEN_INVALID_P(ret)) {
11275 const char *q = p;
11276 long clen = rb_enc_mbmaxlen(enc);
11277 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11278 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11279
11280 if (e - p < clen) clen = e - p;
11281 if (clen <= mbminlen * 2) {
11282 clen = mbminlen;
11283 }
11284 else {
11285 clen -= mbminlen;
11286 for (; clen > mbminlen; clen-=mbminlen) {
11287 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11288 if (MBCLEN_NEEDMORE_P(ret)) break;
11289 if (MBCLEN_INVALID_P(ret)) continue;
11291 }
11292 }
11293 if (rep) {
11294 rb_str_buf_cat(buf, rep, replen);
11295 }
11296 else {
11297 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11298 str_mod_check(str, sp, slen);
11299 repl = str_compat_and_valid(repl, enc);
11300 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11301 }
11302 p += clen;
11303 p1 = p;
11304 }
11305 else {
11307 }
11308 }
11309 if (NIL_P(buf)) {
11310 if (p == e) {
11312 return Qnil;
11313 }
11314 buf = rb_str_buf_new(RSTRING_LEN(str));
11315 }
11316 if (p1 < p) {
11317 rb_str_buf_cat(buf, p1, p - p1);
11318 }
11319 if (p < e) {
11320 if (rep) {
11321 rb_str_buf_cat(buf, rep, replen);
11322 }
11323 else {
11324 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11325 str_mod_check(str, sp, slen);
11326 repl = str_compat_and_valid(repl, enc);
11327 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11328 }
11329 }
11331 }
11332 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11333 return buf;
11334}
11335
11336/*
11337 * call-seq:
11338 * scrub(replacement_string = default_replacement) -> new_string
11339 * scrub{|bytes| ... } -> new_string
11340 *
11341 * :include: doc/string/scrub.rdoc
11342 *
11343 */
11344static VALUE
11345str_scrub(int argc, VALUE *argv, VALUE str)
11346{
11347 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11348 VALUE new = rb_str_scrub(str, repl);
11349 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11350}
11351
11352/*
11353 * call-seq:
11354 * scrub! -> self
11355 * scrub!(replacement_string = default_replacement) -> self
11356 * scrub!{|bytes| ... } -> self
11357 *
11358 * Like String#scrub, except that any replacements are made in +self+.
11359 *
11360 */
11361static VALUE
11362str_scrub_bang(int argc, VALUE *argv, VALUE str)
11363{
11364 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11365 VALUE new = rb_str_scrub(str, repl);
11366 if (!NIL_P(new)) rb_str_replace(str, new);
11367 return str;
11368}
11369
11370static ID id_normalize;
11371static ID id_normalized_p;
11372static VALUE mUnicodeNormalize;
11373
11374static VALUE
11375unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11376{
11377 static int UnicodeNormalizeRequired = 0;
11378 VALUE argv2[2];
11379
11380 if (!UnicodeNormalizeRequired) {
11381 rb_require("unicode_normalize/normalize.rb");
11382 UnicodeNormalizeRequired = 1;
11383 }
11384 argv2[0] = str;
11385 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11386 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11387}
11388
11389/*
11390 * call-seq:
11391 * unicode_normalize(form = :nfc) -> string
11392 *
11393 * Returns a copy of +self+ with
11394 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11395 *
11396 * Argument +form+ must be one of the following symbols
11397 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11398 *
11399 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11400 * - +:nfd+: Canonical decomposition.
11401 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11402 * - +:nfkd+: Compatibility decomposition.
11403 *
11404 * The encoding of +self+ must be one of:
11405 *
11406 * - Encoding::UTF_8
11407 * - Encoding::UTF_16BE
11408 * - Encoding::UTF_16LE
11409 * - Encoding::UTF_32BE
11410 * - Encoding::UTF_32LE
11411 * - Encoding::GB18030
11412 * - Encoding::UCS_2BE
11413 * - Encoding::UCS_4BE
11414 *
11415 * Examples:
11416 *
11417 * "a\u0300".unicode_normalize # => "\u00E0" (precomposed a-grave)
11418 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300" (a + combining grave accent)
11419 *
11420 * Related: String#unicode_normalize!, String#unicode_normalized?.
11421 */
11422static VALUE
11423rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11424{
11425 return unicode_normalize_common(argc, argv, str, id_normalize);
11426}
11427
11428/*
11429 * call-seq:
11430 * unicode_normalize!(form = :nfc) -> self
11431 *
11432 * Like String#unicode_normalize, except that the normalization
11433 * is performed on +self+.
11434 *
11435 * Related: String#unicode_normalized?.
11436 *
11437 */
11438static VALUE
11439rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11440{
11441 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11442}
11443
11444/* call-seq:
11445 * unicode_normalized?(form = :nfc) -> true or false
11446 *
11447 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11448 * +false+ otherwise.
11449 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11450 *
11451 * Examples:
11452 *
11453 * "a\u0300".unicode_normalized? # => false
11454 * "a\u0300".unicode_normalized?(:nfd) # => true
11455 * "\u00E0".unicode_normalized? # => true
11456 * "\u00E0".unicode_normalized?(:nfd) # => false
11457 *
11458 *
11459 * Raises an exception if +self+ is not in a Unicode encoding:
11460 *
11461 * s = "\xE0".force_encoding('ISO-8859-1')
11462 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11463 *
11464 * Related: String#unicode_normalize, String#unicode_normalize!.
11465 *
11466 */
11467static VALUE
11468rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11469{
11470 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11471}
11472
11473/**********************************************************************
11474 * Document-class: Symbol
11475 *
11476 * \Symbol objects represent named identifiers inside the Ruby interpreter.
11477 *
11478 * You can create a \Symbol object explicitly with:
11479 *
11480 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11481 *
11482 * The same \Symbol object will be
11483 * created for a given name or string for the duration of a program's
11484 * execution, regardless of the context or meaning of that name. Thus
11485 * if <code>Fred</code> is a constant in one context, a method in
11486 * another, and a class in a third, the \Symbol <code>:Fred</code>
11487 * will be the same object in all three contexts.
11488 *
11489 * module One
11490 * class Fred
11491 * end
11492 * $f1 = :Fred
11493 * end
11494 * module Two
11495 * Fred = 1
11496 * $f2 = :Fred
11497 * end
11498 * def Fred()
11499 * end
11500 * $f3 = :Fred
11501 * $f1.object_id #=> 2514190
11502 * $f2.object_id #=> 2514190
11503 * $f3.object_id #=> 2514190
11504 *
11505 * Constant, method, and variable names are returned as symbols:
11506 *
11507 * module One
11508 * Two = 2
11509 * def three; 3 end
11510 * @four = 4
11511 * @@five = 5
11512 * $six = 6
11513 * end
11514 * seven = 7
11515 *
11516 * One.constants
11517 * # => [:Two]
11518 * One.instance_methods(true)
11519 * # => [:three]
11520 * One.instance_variables
11521 * # => [:@four]
11522 * One.class_variables
11523 * # => [:@@five]
11524 * global_variables.grep(/six/)
11525 * # => [:$six]
11526 * local_variables
11527 * # => [:seven]
11528 *
11529 * \Symbol objects are different from String objects in that
11530 * \Symbol objects represent identifiers, while String objects
11531 * represent text or data.
11532 *
11533 * == What's Here
11534 *
11535 * First, what's elsewhere. \Class \Symbol:
11536 *
11537 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11538 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11539 *
11540 * Here, class \Symbol provides methods that are useful for:
11541 *
11542 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11543 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11544 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11545 *
11546 * === Methods for Querying
11547 *
11548 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11549 * - #=~: Returns the index of the first substring in symbol that matches a
11550 * given Regexp or other object; returns +nil+ if no match is found.
11551 * - #[], #slice : Returns a substring of symbol
11552 * determined by a given index, start/length, or range, or string.
11553 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11554 * - #encoding: Returns the Encoding object that represents the encoding
11555 * of symbol.
11556 * - #end_with?: Returns +true+ if symbol ends with
11557 * any of the given strings.
11558 * - #match: Returns a MatchData object if symbol
11559 * matches a given Regexp; +nil+ otherwise.
11560 * - #match?: Returns +true+ if symbol
11561 * matches a given Regexp; +false+ otherwise.
11562 * - #length, #size: Returns the number of characters in symbol.
11563 * - #start_with?: Returns +true+ if symbol starts with
11564 * any of the given strings.
11565 *
11566 * === Methods for Comparing
11567 *
11568 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11569 * or larger than symbol.
11570 * - #==, #===: Returns +true+ if a given symbol has the same content and
11571 * encoding.
11572 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11573 * symbol is smaller than, equal to, or larger than symbol.
11574 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11575 * after Unicode case folding; +false+ otherwise.
11576 *
11577 * === Methods for Converting
11578 *
11579 * - #capitalize: Returns symbol with the first character upcased
11580 * and all other characters downcased.
11581 * - #downcase: Returns symbol with all characters downcased.
11582 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11583 * - #name: Returns the frozen string corresponding to symbol.
11584 * - #succ, #next: Returns the symbol that is the successor to symbol.
11585 * - #swapcase: Returns symbol with all upcase characters downcased
11586 * and all downcase characters upcased.
11587 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11588 * - #to_s, #id2name: Returns the string corresponding to +self+.
11589 * - #to_sym, #intern: Returns +self+.
11590 * - #upcase: Returns symbol with all characters upcased.
11591 *
11592 */
11593
11594
11595/*
11596 * call-seq:
11597 * symbol == object -> true or false
11598 *
11599 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11600 */
11601
11602#define sym_equal rb_obj_equal
11603
11604static int
11605sym_printable(const char *s, const char *send, rb_encoding *enc)
11606{
11607 while (s < send) {
11608 int n;
11609 int c = rb_enc_precise_mbclen(s, send, enc);
11610
11611 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11612 n = MBCLEN_CHARFOUND_LEN(c);
11613 c = rb_enc_mbc_to_codepoint(s, send, enc);
11614 if (!rb_enc_isprint(c, enc)) return FALSE;
11615 s += n;
11616 }
11617 return TRUE;
11618}
11619
11620int
11621rb_str_symname_p(VALUE sym)
11622{
11623 rb_encoding *enc;
11624 const char *ptr;
11625 long len;
11626 rb_encoding *resenc = rb_default_internal_encoding();
11627
11628 if (resenc == NULL) resenc = rb_default_external_encoding();
11629 enc = STR_ENC_GET(sym);
11630 ptr = RSTRING_PTR(sym);
11631 len = RSTRING_LEN(sym);
11632 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11633 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11634 return FALSE;
11635 }
11636 return TRUE;
11637}
11638
11639VALUE
11640rb_str_quote_unprintable(VALUE str)
11641{
11642 rb_encoding *enc;
11643 const char *ptr;
11644 long len;
11645 rb_encoding *resenc;
11646
11647 Check_Type(str, T_STRING);
11648 resenc = rb_default_internal_encoding();
11649 if (resenc == NULL) resenc = rb_default_external_encoding();
11650 enc = STR_ENC_GET(str);
11651 ptr = RSTRING_PTR(str);
11652 len = RSTRING_LEN(str);
11653 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11654 !sym_printable(ptr, ptr + len, enc)) {
11655 return rb_str_escape(str);
11656 }
11657 return str;
11658}
11659
11660VALUE
11661rb_id_quote_unprintable(ID id)
11662{
11663 VALUE str = rb_id2str(id);
11664 if (!rb_str_symname_p(str)) {
11665 return rb_str_escape(str);
11666 }
11667 return str;
11668}
11669
11670/*
11671 * call-seq:
11672 * inspect -> string
11673 *
11674 * Returns a string representation of +self+ (including the leading colon):
11675 *
11676 * :foo.inspect # => ":foo"
11677 *
11678 * Related: Symbol#to_s, Symbol#name.
11679 *
11680 */
11681
11682static VALUE
11683sym_inspect(VALUE sym)
11684{
11685 VALUE str = rb_sym2str(sym);
11686 const char *ptr;
11687 long len;
11688 char *dest;
11689
11690 if (!rb_str_symname_p(str)) {
11691 str = rb_str_inspect(str);
11692 len = RSTRING_LEN(str);
11693 rb_str_resize(str, len + 1);
11694 dest = RSTRING_PTR(str);
11695 memmove(dest + 1, dest, len);
11696 }
11697 else {
11698 rb_encoding *enc = STR_ENC_GET(str);
11699
11700 VALUE orig_str = str;
11701 RSTRING_GETMEM(orig_str, ptr, len);
11702
11703 str = rb_enc_str_new(0, len + 1, enc);
11704 dest = RSTRING_PTR(str);
11705 memcpy(dest + 1, ptr, len);
11706
11707 RB_GC_GUARD(orig_str);
11708 }
11709 dest[0] = ':';
11710 return str;
11711}
11712
11713/*
11714 * call-seq:
11715 * to_s -> string
11716 *
11717 * Returns a string representation of +self+ (not including the leading colon):
11718 *
11719 * :foo.to_s # => "foo"
11720 *
11721 * Related: Symbol#inspect, Symbol#name.
11722 */
11723
11724VALUE
11726{
11727 return str_new_shared(rb_cString, rb_sym2str(sym));
11728}
11729
11730VALUE
11731rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11732{
11733 VALUE obj;
11734
11735 if (argc < 1) {
11736 rb_raise(rb_eArgError, "no receiver given");
11737 }
11738 obj = argv[0];
11739 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11740}
11741
11742/*
11743 * call-seq:
11744 * succ
11745 *
11746 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11747 *
11748 * :foo.succ # => :fop
11749 *
11750 * Related: String#succ.
11751 */
11752
11753static VALUE
11754sym_succ(VALUE sym)
11755{
11756 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11757}
11758
11759/*
11760 * call-seq:
11761 * symbol <=> object -> -1, 0, +1, or nil
11762 *
11763 * If +object+ is a symbol,
11764 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11765 *
11766 * :bar <=> :foo # => -1
11767 * :foo <=> :foo # => 0
11768 * :foo <=> :bar # => 1
11769 *
11770 * Otherwise, returns +nil+:
11771 *
11772 * :foo <=> 'bar' # => nil
11773 *
11774 * Related: String#<=>.
11775 */
11776
11777static VALUE
11778sym_cmp(VALUE sym, VALUE other)
11779{
11780 if (!SYMBOL_P(other)) {
11781 return Qnil;
11782 }
11783 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11784}
11785
11786/*
11787 * call-seq:
11788 * casecmp(object) -> -1, 0, 1, or nil
11789 *
11790 * :include: doc/symbol/casecmp.rdoc
11791 *
11792 */
11793
11794static VALUE
11795sym_casecmp(VALUE sym, VALUE other)
11796{
11797 if (!SYMBOL_P(other)) {
11798 return Qnil;
11799 }
11800 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11801}
11802
11803/*
11804 * call-seq:
11805 * casecmp?(object) -> true, false, or nil
11806 *
11807 * :include: doc/symbol/casecmp_p.rdoc
11808 *
11809 */
11810
11811static VALUE
11812sym_casecmp_p(VALUE sym, VALUE other)
11813{
11814 if (!SYMBOL_P(other)) {
11815 return Qnil;
11816 }
11817 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11818}
11819
11820/*
11821 * call-seq:
11822 * symbol =~ object -> integer or nil
11823 *
11824 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11825 * including possible updates to global variables;
11826 * see String#=~.
11827 *
11828 */
11829
11830static VALUE
11831sym_match(VALUE sym, VALUE other)
11832{
11833 return rb_str_match(rb_sym2str(sym), other);
11834}
11835
11836/*
11837 * call-seq:
11838 * match(pattern, offset = 0) -> matchdata or nil
11839 * match(pattern, offset = 0) {|matchdata| } -> object
11840 *
11841 * Equivalent to <tt>self.to_s.match</tt>,
11842 * including possible updates to global variables;
11843 * see String#match.
11844 *
11845 */
11846
11847static VALUE
11848sym_match_m(int argc, VALUE *argv, VALUE sym)
11849{
11850 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11851}
11852
11853/*
11854 * call-seq:
11855 * match?(pattern, offset) -> true or false
11856 *
11857 * Equivalent to <tt>sym.to_s.match?</tt>;
11858 * see String#match?.
11859 *
11860 */
11861
11862static VALUE
11863sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11864{
11865 return rb_str_match_m_p(argc, argv, sym);
11866}
11867
11868/*
11869 * call-seq:
11870 * symbol[index] -> string or nil
11871 * symbol[start, length] -> string or nil
11872 * symbol[range] -> string or nil
11873 * symbol[regexp, capture = 0] -> string or nil
11874 * symbol[substring] -> string or nil
11875 *
11876 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11877 *
11878 */
11879
11880static VALUE
11881sym_aref(int argc, VALUE *argv, VALUE sym)
11882{
11883 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11884}
11885
11886/*
11887 * call-seq:
11888 * length -> integer
11889 *
11890 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11891 */
11892
11893static VALUE
11894sym_length(VALUE sym)
11895{
11896 return rb_str_length(rb_sym2str(sym));
11897}
11898
11899/*
11900 * call-seq:
11901 * empty? -> true or false
11902 *
11903 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11904 *
11905 */
11906
11907static VALUE
11908sym_empty(VALUE sym)
11909{
11910 return rb_str_empty(rb_sym2str(sym));
11911}
11912
11913/*
11914 * call-seq:
11915 * upcase(*options) -> symbol
11916 *
11917 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11918 *
11919 * See String#upcase.
11920 *
11921 */
11922
11923static VALUE
11924sym_upcase(int argc, VALUE *argv, VALUE sym)
11925{
11926 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11927}
11928
11929/*
11930 * call-seq:
11931 * downcase(*options) -> symbol
11932 *
11933 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11934 *
11935 * See String#downcase.
11936 *
11937 * Related: Symbol#upcase.
11938 *
11939 */
11940
11941static VALUE
11942sym_downcase(int argc, VALUE *argv, VALUE sym)
11943{
11944 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11945}
11946
11947/*
11948 * call-seq:
11949 * capitalize(*options) -> symbol
11950 *
11951 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11952 *
11953 * See String#capitalize.
11954 *
11955 */
11956
11957static VALUE
11958sym_capitalize(int argc, VALUE *argv, VALUE sym)
11959{
11960 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11961}
11962
11963/*
11964 * call-seq:
11965 * swapcase(*options) -> symbol
11966 *
11967 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11968 *
11969 * See String#swapcase.
11970 *
11971 */
11972
11973static VALUE
11974sym_swapcase(int argc, VALUE *argv, VALUE sym)
11975{
11976 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11977}
11978
11979/*
11980 * call-seq:
11981 * start_with?(*string_or_regexp) -> true or false
11982 *
11983 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11984 *
11985 */
11986
11987static VALUE
11988sym_start_with(int argc, VALUE *argv, VALUE sym)
11989{
11990 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11991}
11992
11993/*
11994 * call-seq:
11995 * end_with?(*strings) -> true or false
11996 *
11997 *
11998 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11999 *
12000 */
12001
12002static VALUE
12003sym_end_with(int argc, VALUE *argv, VALUE sym)
12004{
12005 return rb_str_end_with(argc, argv, rb_sym2str(sym));
12006}
12007
12008/*
12009 * call-seq:
12010 * encoding -> encoding
12011 *
12012 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
12013 *
12014 */
12015
12016static VALUE
12017sym_encoding(VALUE sym)
12018{
12019 return rb_obj_encoding(rb_sym2str(sym));
12020}
12021
12022static VALUE
12023string_for_symbol(VALUE name)
12024{
12025 if (!RB_TYPE_P(name, T_STRING)) {
12026 VALUE tmp = rb_check_string_type(name);
12027 if (NIL_P(tmp)) {
12028 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
12029 name);
12030 }
12031 name = tmp;
12032 }
12033 return name;
12034}
12035
12036ID
12038{
12039 if (SYMBOL_P(name)) {
12040 return SYM2ID(name);
12041 }
12042 name = string_for_symbol(name);
12043 return rb_intern_str(name);
12044}
12045
12046VALUE
12048{
12049 if (SYMBOL_P(name)) {
12050 return name;
12051 }
12052 name = string_for_symbol(name);
12053 return rb_str_intern(name);
12054}
12055
12056/*
12057 * call-seq:
12058 * Symbol.all_symbols -> array_of_symbols
12059 *
12060 * Returns an array of all symbols currently in Ruby's symbol table:
12061 *
12062 * Symbol.all_symbols.size # => 9334
12063 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
12064 *
12065 */
12066
12067static VALUE
12068sym_all_symbols(VALUE _)
12069{
12070 return rb_sym_all_symbols();
12071}
12072
12073VALUE
12075{
12076 return rb_fstring(str);
12077}
12078
12079VALUE
12080rb_interned_str(const char *ptr, long len)
12081{
12082 struct RString fake_str;
12083 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
12084}
12085
12086VALUE
12088{
12089 return rb_interned_str(ptr, strlen(ptr));
12090}
12091
12092VALUE
12093rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12094{
12095 if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) {
12096 rb_enc_autoload(enc);
12097 }
12098
12099 struct RString fake_str;
12100 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
12101}
12102
12103VALUE
12105{
12106 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12107}
12108
12109void
12110Init_String(void)
12111{
12112 rb_cString = rb_define_class("String", rb_cObject);
12113 assert(rb_vm_fstring_table());
12114 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12116 rb_define_alloc_func(rb_cString, empty_str_alloc);
12117 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12118 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12119 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12120 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12121 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12124 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12125 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12126 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12127 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12130 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12131 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12132 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12133 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12136 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12137 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12138 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12139 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12140 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12142 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12144 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12145 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12146 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12147 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12148 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12149 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12151 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12152 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12153 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12154 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12155 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12156 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12157 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12158 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12160 rb_define_method(rb_cString, "+@", str_uplus, 0);
12161 rb_define_method(rb_cString, "-@", str_uminus, 0);
12162 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12163 rb_define_alias(rb_cString, "dedup", "-@");
12164
12165 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12166 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12167 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12168 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12171 rb_define_method(rb_cString, "undump", str_undump, 0);
12172
12173 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12174 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12175 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12176 sym_fold = ID2SYM(rb_intern_const("fold"));
12177
12178 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12179 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12180 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12181 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12182
12183 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12184 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12185 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12186 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12187
12188 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12189 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12190 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12191 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12192 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12193 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12194 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12195 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12196 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12197 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12198 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12200 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12201 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12202 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12203 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12204 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12205
12206 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12207 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12208 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12209
12210 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12211
12212 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12213 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12214 rb_define_method(rb_cString, "center", rb_str_center, -1);
12215
12216 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12217 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12218 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12219 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12220 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12221 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12222 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12223 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12224 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12225
12226 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12227 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12228 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12229 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12230 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12231 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12232 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12233 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12234 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12235
12236 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12237 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12238 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12239 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12240 rb_define_method(rb_cString, "count", rb_str_count, -1);
12241
12242 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12243 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12244 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12245 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12246
12247 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12248 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12249 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12250 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12251 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12252
12253 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12254
12255 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12256 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12257
12258 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12259 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12260
12261 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12262 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12263 rb_define_method(rb_cString, "b", rb_str_b, 0);
12264 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12265 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12266
12267 /* define UnicodeNormalize module here so that we don't have to look it up */
12268 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12269 id_normalize = rb_intern_const("normalize");
12270 id_normalized_p = rb_intern_const("normalized?");
12271
12272 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12273 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12274 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12275
12276 rb_fs = Qnil;
12277 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12278 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12279 rb_gc_register_address(&rb_fs);
12280
12281 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12285 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12286
12287 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12288 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12289 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12291 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12292 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12293 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12294 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12295 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12296
12297 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12298 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12299 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12300 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12301
12302 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12303 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12304 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12305 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12306 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12307 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12308 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12309
12310 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12311 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12312 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12313 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12314
12315 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12316 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12317
12318 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12319}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that does not interface with RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module into a class.
Definition class.c:1177
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:970
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1085
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2336
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2160
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves arguments from argc and argv into the given VALUE references according to the format string.
Definition class.c:2626
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:866
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2415
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define NEWOBJ_OF
Old name of RB_NEWOBJ_OF.
Definition newobj.h:61
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:136
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:516
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:517
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:518
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:515
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:433
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3567
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1348
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1344
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1351
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1342
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1346
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:634
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2058
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2076
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1237
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3431
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:215
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:541
VALUE rb_cSymbol
Symbol class.
Definition string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:147
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3145
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:619
Encoding-related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:682
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:703
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:570
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:446
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:98
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:590
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:431
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:618
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:725
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1149
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:769
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1015
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2762
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1034
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12093
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:252
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2106
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:962
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1254
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1155
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:781
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12104
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:653
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1121
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1208
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:495
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:538
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1793
VALUE rb_sym_all_symbols(void)
Collects every symbol that has ever been interned in the entire history of the current pr...
Definition symbol.c:1020
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1799
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1744
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1235
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4177
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3674
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1441
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1884
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby string instead of a C string.
Definition string.c:12074
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1538
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1318
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2257
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3414
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1230
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11725
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2329
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "defaultexternal" encoding.
Definition string.c:1206
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1532
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2790
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4862
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3634
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:11018
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1747
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1585
#define rb_str_buf_cat
Just another name of rb_str_cat
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:997
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:815
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3623
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2195
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1802
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6069
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:2895
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12087
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "defaultexternal" encoding.
Definition string.h:1604
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2837
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3736
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6781
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2535
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12080
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3690
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3505
int rb_str_comparable(VALUE str1, VALUE str2)
Checks whether two strings are comparable with each other.
Definition string.c:3665
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3356
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:3004
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5372
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11076
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1488
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2686
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2982
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3075
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1009
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2491
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6895
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1218
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2209
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5290
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8973
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1003
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2937
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1274
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:953
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:12047
ID rb_to_id(VALUE str)
Definition string.c:12037
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1823
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3458
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4421
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1376
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs in its return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name for rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1248
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks the contents for viability as a C string.
Definition string.c:2663
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs in its return type.
Definition rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2547
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into the default external encoding.
Definition string.c:1242
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2558
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1576
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks the contents for viability as a C string.
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1394
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:196
union RString::@50 as
String's specific fields.
struct RString::@50::@51 heap
Strings that use separated memory region for contents use this pattern.
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
struct RString::@50::@52 embed
Embedded contents.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
union RString::@50::@51::@53 aux
Auxiliary info.
VALUE shared
Parent of the string.
Definition rstring.h:240
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
Definition st.h:79
Definition string.c:7853
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:296
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:432