#include #include "token.h" /******************************************************************************/ GetTokens(SecPtr, PostInfo) /******************************************************************************/ char *SecPtr; /* pointer to secton buffer */ POSTINFO *PostInfo; /* pointer to index term information */ { char TokBuf[cMAXTOKENLEN]; /* token buffer */ char *SecPtrOrg; /* orginal pointer to section */ char *TokPtrOrg; /* orginal pointer to token */ int CharType; /* character type */ int PostInfoCnt = 0; /* index term count */ int WordNum = 0; /* token serial number to section */ char UseAsIndex; SecPtrOrg = SecPtr; while ( *SecPtr ) { TokPtrOrg = SecPtr; CharType = getToken(&SecPtr,TokBuf); if ( !strlen(TokBuf) ) continue; switch ( CharType ) { case T_KO: /* ÇÑ±Û */ case T_EN: /* ¿µ¾î */ case T_HJ: /* ÇÑÀÚ */ case T_JP: /* ÀϾî */ case T_DG: /* ¼ýÀÚ */ if ( PostInfoCnt >= cMAXPOSTINFOSIZE ) return ( cMAXPOSTINFOSIZE ); UseAsIndex = TokGrp[CharType].TokGrp; if (UseAsIndex == '1') { strcpy(PostInfo[PostInfoCnt].Key, TokBuf); if (CharType == T_EN) { strlower(PostInfo[PostInfoCnt].Key); } PostInfo[PostInfoCnt].KeyLen = strlen(PostInfo[PostInfoCnt].Key); if (PostInfo[PostInfoCnt].KeyLen <= 0) break; PostInfo[PostInfoCnt].WordNum = ++WordNum; if ( PostInfo[PostInfoCnt].KeyLen > cMAXTOKENLEN ) PostInfo[PostInfoCnt].KeyLen = cMAXTOKENLEN; PostInfoCnt++; } break; case T_SC: case T_CK: case T_U1: case T_U2: ++WordNum; } } return(PostInfoCnt); } /******************************************************************************/ int getToken(SecPtr,TokBuf) /******************************************************************************/ char **SecPtr, *TokBuf; { int i = 0; int PrevCharType; int CurrCharType; char NextCharType; PrevCharType = cType1[(unsigned char)**SecPtr]; while ( **SecPtr ) { CurrCharType = (unsigned char)**SecPtr; NextCharType = (unsigned char)*(*SecPtr+1); switch ( cType1[CurrCharType] ) { case T_EN: /* English */ if ( PrevCharType == T_EN ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); case T_DG: /* Digit */ if ( PrevCharType == T_DG ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); case T_SC: /* Special Character */ if ( PrevCharType == T_SC ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); case T_CK: /* Control Key */ if ( PrevCharType == T_CK ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; TokBuf[i] = '\0'; return( PrevCharType ); } TokBuf[i] = '\0'; return( PrevCharType ); case T_BL: /* Blank */ if ( PrevCharType == T_BL ) { ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); case T_U1: /* Unknown character (1 byte)*/ if ( PrevCharType == T_U1 ) { ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); case T_KO: /* Korean */ if ( NextCharType & 0x80 ) /* Á¤»ó */ { if ( PrevCharType == T_KO ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); } else /* Äڵ尡 ±úÁ³À¸¸é */ { ++*SecPtr; TokBuf[i] = '\0'; return( PrevCharType ); } case T_HJ: /* Hanja */ if ( NextCharType & 0x80 ) /* Á¤»ó */ { if ( PrevCharType == T_HJ ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); } else /* Äڵ尡 ±úÁ³À¸¸é */ { ++*SecPtr; TokBuf[i] = '\0'; return( PrevCharType ); } case T_JP: /* Japanse */ if ( NextCharType & 0x80 ) /* Á¤»ó */ { if ( PrevCharType == T_JP ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); } else /* Äڵ尡 ±úÁ³À¸¸é */ { ++*SecPtr; TokBuf[i] = '\0'; return( PrevCharType ); } case T_U2: /* unknown character (2 byte) */ if ( NextCharType & 0x80 ) /* Á¤»ó */ { if ( PrevCharType == T_U2 ) { if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; if (i < cMAXTOKENLEN) TokBuf[i++] = **SecPtr; ++*SecPtr; break; } TokBuf[i] = '\0'; return( PrevCharType ); } else /* Äڵ尡 ±úÁ³À¸¸é */ { ++*SecPtr; TokBuf[i] = '\0'; return( PrevCharType ); } } } TokBuf[i] = '\0'; return( PrevCharType ); } /******************************************************************************/