/*
* This program reads arbitrary text from STDIN and splits it into tokens.
* A token is either a word (a run of alphabetic characters) or a single
* non-alphabetic character.
*
* When it finds a '0xFF' char, it treats the next char as the number of
* repetitions of the third char that follows:
* "Hi everyb0xFF0x390dy" ---> "Hi everybooooooooody"
*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#define WORD 1
#define PUNCT 2
/* forward declarations of the tokenizer routines defined below */
void parse(int c);
void got_token(int type);
void add_to_token(int c);
void close_token(void);

/* amount of words read from the input text */
unsigned int words = 0;
/* amount of punctuation characters, including spaces, etc. */
unsigned int puncts = 0;
/* index into array_token of the token currently being built */
unsigned int token_num = 0;
/* index of the last character stored in the current token (temporary var) */
unsigned int length = 0;

/* pointer to an area of memory containing pointers to tokens */
int **array_token;
/*
 * Reads STDIN, expands 0xFF <count> <char> escapes, feeds every resulting
 * character to parse(), then prints the counters and the collected tokens
 * and releases all token storage.
 *
 * Returns 0 on success, EXIT_FAILURE if the token table cannot be allocated.
 */
int main(void) {
    int c, count;
    unsigned int i;
    const int *p;

    /*
     * The first slot must start out as NULL: add_to_token() compares
     * array_token[token_num] with NULL to decide whether a token has
     * already been started.
     */
    array_token = malloc(sizeof *array_token);
    if (array_token == NULL) {
        perror("malloc");
        return EXIT_FAILURE;
    }
    array_token[0] = NULL;

    /* decompress/read and store STDIN in tokens */
    for (;;) {
        c = getchar();
        if (c == EOF)
            break;
        if (c != 0xFF) {
            parse(c);
            continue;
        }
        /*
         * Escape sequence: 0xFF <count> <char>. If the input ends in the
         * middle of it there is nothing to repeat, so treat it as plain
         * end of input instead of looping on EOF.
         */
        count = getchar();
        if (count == EOF)
            break;
        c = getchar();
        if (c == EOF)
            break;
        while (count-- > 0)
            parse(c);
    }
    /* signal end of input to the tokenizer, exactly once */
    parse(EOF);

    /* eventually, do something with the data... */
    printf("Words:\t\t%u\n" /* total amount of words read */
           "Puncts:\t\t%u\n" /* total amount of punctuation chars */
           "Tokens:\t\t%u\n" /* number of tokens */
           "Arrays used:\t%u\n", /* real number of arrays used */
           words,
           puncts,
           words + puncts,
           token_num + 1);

    /*
     * Print every token, in input order. Tokens are stored as
     * 0-terminated arrays of int, so they cannot be handed to "%s";
     * emit them one element at a time instead.
     */
    puts("\nThese are the tokens:\n");
    for (i = 0; i <= token_num; i++) {
        for (p = array_token[i]; p != NULL && *p != '\0'; p++)
            putchar(*p);
        putchar('\n');
    }

    /* free used memory!! */
    for (i = 0; i <= token_num; i++)
        free(array_token[i]);
    free(array_token);
    return 0;
}
void parse(int c) {
static enum {
START, IN_WORD
} state;
if (c == EOF) {
if (state == IN_WORD)
words++;
else {
free(array_token[token_num]);
if (token_num > 0)
token_num--;
}
close_token();
return;
}
switch (state) {
case IN_WORD:
if (isalpha(c)) {
add_to_token(c);
return;
}
got_token(WORD);
state = START;
/* fall through */
case START:
add_to_token(c);
if (isalpha(c))
state = IN_WORD;
else
got_token(PUNCT);
break;
}
}
/*
 * Finishes the current token and opens an empty slot for the next one.
 *
 * type is WORD or PUNCT and selects which counter is incremented; any other
 * value leaves both counters unchanged. The current token is terminated via
 * close_token(), then the pointer table grows by one entry and token_num
 * advances to it. Exits the program if the table cannot be grown.
 */
void got_token(int type) {
    int **grown;

    switch (type) {
    case WORD:
        words++;
        break;
    case PUNCT:
        puncts++;
        break;
    default:
        break;
    }
    close_token();

    /* token_num is an index, so the table needs token_num + 2 entries */
    grown = realloc(array_token, ((size_t)token_num + 2) * sizeof *grown);
    if (grown == NULL) {
        perror("realloc");
        exit(EXIT_FAILURE);
    }
    array_token = grown;
    token_num++;
    /*
     * realloc does not initialise the new entry; add_to_token() relies on
     * an unused slot being NULL.
     */
    array_token[token_num] = NULL;
}
/*
 * Appends character c to the token at array_token[token_num].
 *
 * A NULL slot means the token has not been started yet: its storage is
 * created and length restarts at 0. Otherwise the token grows by one
 * element. After the call, length is the index at which c was stored.
 * Exits the program if memory cannot be allocated.
 */
void add_to_token(int c) {
    int *tok;
    unsigned int i;

    /* We should have already initialized array_token. Let's be safe... */
    if (array_token == NULL) {
        array_token = malloc(((size_t)token_num + 1) * sizeof *array_token);
        if (array_token == NULL) {
            perror("malloc");
            exit(EXIT_FAILURE);
        }
        /* every slot must read as "not started" before it is tested below */
        for (i = 0; i <= token_num; i++)
            array_token[i] = NULL;
    }

    /* first character of a token goes to index 0, later ones to the next index */
    if (array_token[token_num] == NULL)
        length = 0;
    else
        length++;

    /* realloc(NULL, n) acts like malloc(n), so one call covers both cases */
    tok = realloc(array_token[token_num], ((size_t)length + 1) * sizeof *tok);
    if (tok == NULL) {
        perror("realloc");
        exit(EXIT_FAILURE);
    }
    tok[length] = c;
    array_token[token_num] = tok;
}
/*
 * Terminates the token at array_token[token_num] with a '\0' element so it
 * can be walked like a string.
 *
 * The token grows by one element to hold the terminator, and length is left
 * at the terminator's index. A slot that is still NULL (no character was
 * ever added) becomes an empty token. Exits the program if memory cannot be
 * allocated.
 */
void close_token(void) {
    int *tok = array_token[token_num];
    /* length is the index of the last stored character, if there is one */
    unsigned int end = (tok == NULL) ? 0 : length + 1;

    tok = realloc(tok, ((size_t)end + 1) * sizeof *tok);
    if (tok == NULL) {
        perror("realloc");
        exit(EXIT_FAILURE);
    }
    tok[end] = '\0';
    array_token[token_num] = tok;
    length = end;
}