/*
* call-seq:
* Token.new(text, start, end, pos_inc = 1) -> new Token
*
* Creates a new token setting the text, start and end offsets of the token
* and the position increment for the token.
*
* The position increment is usually set to 1 but you can set it to other
* values as needed. For example, if you have a stop word filter you will be
* skipping tokens. Let's say you have the stop words "the" and "and" and you
* parse the title "The Old Man and the Sea". The terms "Old", "Man" and
* "Sea" will have the position increments 2, 1 and 3 respectively.
*
* Another reason you might want to vary the position increment is if you are
* adding synonyms to the index. For example let's say you have the synonym
* group "quick", "fast" and "speedy". When tokenizing the phrase "Next day
* speedy delivery", you'll add "speedy" first with a position increment of 1
* and then "fast" and "quick" with position increments of 0 since they are
* represented in the same position.
*
* The offset values +start+ and +end+ should be byte offsets, not
* character offsets. This makes it easy to use those offsets to quickly
* access the token in the input string and also to insert highlighting tags
* when necessary.
*
* text:: the main text for the token.
* start:: the start offset of the token in bytes.
* end:: the end offset of the token in bytes.
* pos_inc:: the position increment of a token. See above.
* return:: a newly created and assigned Token object
*/
static VALUE
frt_token_init(int argc, VALUE *argv, VALUE self)
{
    RToken *token;
    VALUE rtext, rstart, rend, rpos_inc, rtype;
    VALUE text;
    int start, end;
    int pos_inc = 1;    /* default when no position increment is supplied */
    int supplied;

    GET_TK(token, self);

    /* "32": three required arguments (text, start, end) followed by two
     * optional ones (pos_inc, type). The return value is the number of
     * arguments that were actually passed. */
    supplied = rb_scan_args(argc, argv, "32", &rtext, &rstart,
                            &rend, &rpos_inc, &rtype);

    /* Convert every argument into a local before touching the token, so
     * that a conversion error raised below leaves the token unmodified.
     *
     * NUM2INT is used rather than FIX2INT because these values come
     * straight from the caller and are not guaranteed to be Fixnums:
     * NUM2INT checks the type and the range and raises on bad input,
     * whereas FIX2INT is only specified for Fixnum operands. */
    if (supplied >= 4) {
        pos_inc = NUM2INT(rpos_inc);
    }
    /* rtype is accepted for call compatibility but ignored at this stage */
    text = rb_obj_as_string(rtext);
    start = NUM2INT(rstart);
    end = NUM2INT(rend);

    token->pos_inc = pos_inc;
    token->text = text;
    token->start = start;
    token->end = end;
    return self;
}