/* GlOSS-Collector Daemon
 *
 * Scans the inverted index of a (WAIS) database, writes one line of
 * frequency information per indexed word to a temporary file, and ships
 * that file to the GlOSS server over TCP or by email.
 *
 * author : Brian Lent
 * created: January 26, 1994
 * revised: May 15, 1994
 * revised: September 1994 by Luis Gravano
 * COPYRIGHT (c) Stanford University, 1994.
 */

/* NOTE(review): the header names of the following #include directives are
   missing from the text as received (they look like stripped <...>
   operands).  Restore them from the original distribution before
   building. */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/* WAIS headers */
#include "irfiles.h"
#include "cutil.h"
#include "futil.h"
#include "version.h"
#ifndef FIELDS
#include "irlex.h" /* for MAX_WORD_LENGTH */
#else
#include "weight.h"
#endif

/* Collector version string written at the top of every file header */
#define COLLECTOR_VERSION "v0.3"

/* Account and host where to send email to the GlOSS server */
#define GlOSS_ACCOUNT "gloss"
#define GlOSS_EMAIL_HOST "db.stanford.edu"

/* Name given by uuencode to the frequency file to be emailed to the GlOSS host */
#define UUENCODE_FILE "collector_freqs_file.Z"

/* Definitions for a TCP "client" program: port and host of the GlOSS
   server, and the size of the line/message buffers used in this file */
#define SERV_TCP_PORT 6001
#define GlOSS_HOST "gloss.stanford.edu"
#define MAXLINE 1000

/* size of the one-line database description to be associated with each
   database (including the terminating '\0') */
#define MAXDESCRIPTION 81

/* number of times that the collector will try to open a busy database before giving up */
#define MAX_ATTEMPTS 60

/* number of seconds between two consecutive attempts to open a busy database */
#define ATTEMPT_SEP 60

/* values of the 'conn_method' parameter */
#define EMAIL 'E'
#define TCP 'T'

/* values of the 'searchable' parameter */
#define SEARCHABLE 'S'
#define NONSEARCHABLE 'N'
#define FORM 'F'

/* fallback for systems whose headers do not define FILENAME_MAX */
#ifndef FILENAME_MAX
#define FILENAME_MAX 255
#endif

/* default values for some of the input parameters to the collector:
   hour of the day at which it runs and number of days between runs */
#define DEFAULT_HOUR 1
#define DEFAULT_DAY 7

/* some type definitions adapted from freeWAIS-0.3 code, Copyright (c) MCNC, Clearinghouse for Networked Information Discovery and Retrieval, 1994. 
*/ #ifndef docID #define docID unsigned long #endif #define mypostingWeight float typedef struct serialPostingFile{ FILE* stream; long length; long current_index_block; } serialPostingFile; typedef struct postingsForATerm{ char word[MAX_WORD_LENGTH + 1]; unsigned long entries; docID* docs; mypostingWeight* weights; } postingsForATerm; /*----------------------------------------------------------------------*/ /* HEADERS */ /*----------------------------------------------------------------------*/ void InitializeParameters(int, char **, char *, char *, char *, int *, int *, char *, char *, boolean *, char *, char *, FILE **); int ScanInput(int, char **, char *, char *, char *, int *, int *, char *, char *, boolean *); void PrintUsage(char *); void PrintParameters(char *, char *, char, int, int, char *, char); FILE *CreateErrorLog(char *, char *, char *, char, int, int, char *, char); FILE *WriteHeader(char *, char *, char *, char, int, int, char *, char); void GatherAndSendFreqs(char *, char *, char, int, int, char *, char, char *, FILE **); void GenerateFreqFile(char *, char *, char, int, int, char *, char, char *, FILE **); database *OpenDB(char *); void Panic(char *); void RemoveTemps(); serialPostingFile *initSerialPostingFile(char *); postingsForATerm *getPostingsForNextTerm(serialPostingFile *); void readDictionaryIndexBlock(char *, FILE *); void readPostings(serialPostingFile *, postingsForATerm *, long); void countPostingsForATerm(postingsForATerm *, FILE *, long *); void disposePostingsForATerm(postingsForATerm *); void disposeSerialPostingFile(serialPostingFile *); void doTCP_IP(char *); int OpenTCP_Connection(); void TransmitFile(char *, int); void doMail(char *); void ScheduleNextExecution(int, char **, int, int); #ifndef bzero extern void bzero(); #endif extern int socket(); extern int connect(); extern clock_t times(); #ifndef htons extern unsigned short htons(); #endif /*----------------------------------------------------------------------*/ /* MAIN 
*/ /*----------------------------------------------------------------------*/ char *log_file_name; FILE *logfile; FILE *errorlog = (FILE *)NULL; char errorlog_fn[L_tmpnam]; /* The temporary error-log file name string */ FILE *info = (FILE *)NULL; char info_fn[L_tmpnam]; /* The temporary work-file name string */ void main(int argc, char **argv) { /* command-line arguments */ char index_fn[FILENAME_MAX]; char url_fn[FILENAME_MAX]; char searchable; int day, hour; char desc_fn[MAXDESCRIPTION]; char conn_method; boolean first_run; InitializeParameters(argc, argv, index_fn, url_fn, &searchable, &day, &hour, desc_fn, &conn_method, &first_run, info_fn, errorlog_fn, &errorlog); /* if this is not the first time the collector runs, schedule next run through the 'at' command before the current frequency gathering */ if(!first_run) ScheduleNextExecution(argc, argv, day, hour); GatherAndSendFreqs(index_fn, url_fn, searchable, day, hour, desc_fn, conn_method, info_fn, &info); /* if this is the first time the collector runs, schedule next run only after the current run has finished successfully, to detect any problems */ if(first_run) ScheduleNextExecution(argc, argv, day, hour); exit(0); } /*----------------------------------------------------------------------*/ /*----------------------------------------------------------------------*/ void InitializeParameters(int argc, char **argv, char *index_fn, char *url_fn, char *searchable, int *day, int *hour, char *desc_fn, char *conn_method, boolean *first_run, char *fn, char *efn, FILE **err){ /* Reads input and initializes various parameters. 
*/ /* initialize name of the temporary files to be used later */ strcpy(fn, ""); strcpy(efn, ""); /* assign default values */ *searchable=SEARCHABLE; *day=DEFAULT_DAY; *hour=DEFAULT_HOUR; strcpy(desc_fn, ""); *conn_method=TCP; *first_run=true; /* this parameter is used to distinguished the first, more prone-to-errors execution of the collector from the following ones, scheduled through the 'at' system call */ /* assign input values to parameters */ if(ScanInput(argc, argv, index_fn, url_fn, searchable, day, hour, desc_fn, conn_method, first_run)<0){ PrintUsage(argv[0]); exit(1); } /* the default description is the db's URL */ if(strcmp(desc_fn, "")==0) strcpy(desc_fn, url_fn); /* display the parameters entered */ PrintParameters(index_fn, url_fn, *searchable, *day, *hour, desc_fn, *conn_method); /* create and initialize error log */ *err=CreateErrorLog(efn, index_fn, url_fn, *searchable, *day, *hour, desc_fn, *conn_method); } /*----------------------------------------------------------------------*/ int ScanInput(int argc, char **argv, char *index_fn, char *url_fn, char *searchable, int *day, int *hour, char *desc_fn, char *conn_method, boolean *first_run){ int found_index_fn=false; int found_url=false; int i; for(i=1; i90)) return -1; break; case 'h': *hour=atoi(argv[i]); if((*hour<0)||(*hour>23)) return -1; break; case 'l': strcpy(desc_fn, argv[i]); break; case 'm': if(strcmp(argv[i], "tcp")==0){ *conn_method=TCP; break; } if(strcmp(argv[i], "email")==0){ *conn_method=EMAIL; break; } return -1; default: return -1; } } } } if(found_index_fn && found_url) return 0; else return -1; } /*----------------------------------------------------------------------*/ void PrintUsage(char *command){ fprintf(stderr, "\nUsage: %s\n\n", command); fprintf(stderr, " -n file_name # name of the (WAIS) database to be scanned.\n"); fprintf(stderr, " -u URL # URL of the database's 'home page.'\n\n"); fprintf(stderr, " [-s search|nosearch] # whether you can 'search' the page 
corresponding\n"); fprintf(stderr, " to the URL above. Default: search\n"); fprintf(stderr, " (Ex: wais://tzatziki.stanford.edu/NEWS?my+query)\n"); fprintf(stderr, " [-d day_freq] # days [1..90] between collector runs. Default: %d\n", DEFAULT_DAY); fprintf(stderr, " [-h hour] # hour [0..23] when the collector runs. \ Default: %d\n", DEFAULT_HOUR); fprintf(stderr, " [-t] # 'test' mode: run just once and exit.\n"); fprintf(stderr, " [-l short_desc] # short description of the database\n"); fprintf(stderr, " (enclosed in quotes and <= %d chars).\n", MAXDESCRIPTION-1); fprintf(stderr, " Default: URL\n"); fprintf(stderr, " [-m tcp|email] # way to send the information collected to GlOSS.\n"); fprintf(stderr, " Default: tcp\n"); fprintf(stderr, "\nFor more information please check:\n"); fprintf(stderr, " http://gloss.stanford.edu/running.html\n\n"); /* the -f flag indicates that this is _not_ the first execution of the collector with these parameters. This flag is not for users to use, but for the collector to schedule its following executions through the 'at' command. */ } /*----------------------------------------------------------------------*/ void PrintParameters(char *index_fn, char *url_fn, char searchable, int day, int hour, char *desc_fn, char conn_method){ printf("\nGlOSS Collector:\n"); printf(" Database: %s\n", index_fn); printf(" URL: %s\n", url_fn); printf(" Searchable: "); switch(searchable){ case SEARCHABLE: printf("Yes\n"); break; case NONSEARCHABLE: printf("No\n"); break; case FORM: printf("Through a form\n"); break; default: printf("Unknown value\n"); break; } printf(" Description: %s\n", desc_fn); printf("\n"); printf("Communication with the GlOSS server: "); if(conn_method==EMAIL) printf("email\n"); else printf("TCP connection\n"); if(day==0) printf("\n-> Run only once and exit.\n"); else printf("\n-> Run every %d days, at approx. 
%d local time.\n", day, hour); printf("\n"); } /*----------------------------------------------------------------------*/ FILE *CreateErrorLog(char *error_file, char *index_fn, char *url_fn, char searchable, int day, int hour, char *desc_fn, char conn_method){ /* Opens a temporary file that will be used to record error conditions during the frequency collection */ FILE *tmp_file; if(tmpnam(error_file)==NULL){ fprintf(stderr, "GlOSS Collector ERROR: can't get name for a temporary \ error-log file\n"); exit(1); } if((tmp_file=WriteHeader(error_file, index_fn, url_fn, searchable, day, hour, desc_fn, conn_method))==(FILE *)NULL){ fprintf(stderr, "GlOSS Collector ERROR: can't open error log file\n"); exit(1); }; return tmp_file; } /*----------------------------------------------------------------------*/ FILE *WriteHeader(char *fn, char *index_fn, char *url_fn, char searchable, int day, int hour, char *desc_fn, char conn_method){ /* Opens file fn, and writes a header to it. Returns a pointer to the open file, or NULL, if there's a problem. 
*/ FILE *inf; char *search_method; if((inf=fopen(fn, "w"))!=(FILE *)NULL){ fprintf(inf, "GlOSS_Collector: %s\n", COLLECTOR_VERSION); fprintf(inf, "DBNAME: %s\n", index_fn); fprintf(inf, "Source_Type: "); switch(searchable){ case SEARCHABLE: search_method=""; break; case NONSEARCHABLE: search_method="NonSearchable "; break; case FORM: search_method="Form "; break; default: search_method="Unknown "; break; } fprintf(inf, "%s%s\n", search_method, VERSION); /* WAIS version */ fprintf(inf, "Run every %d days, at %d local time.\n", day, hour); fprintf(inf, "URL: %s\n", url_fn); fprintf(inf, "DBDESC: %s\n", desc_fn); fprintf(inf, "CONNECTION: "); if(conn_method==EMAIL) fprintf(inf, "Email\n"); else fprintf(inf, "TCP\n"); } return inf; } /*----------------------------------------------------------------------*/ /*----------------------------------------------------------------------*/ void GatherAndSendFreqs(char *index_fn, char *url_fn, char searchable, int day, int hour, char *desc_fn, char conn_method, char *fn, FILE **inf){ /* One run of frequency collecting. */ GenerateFreqFile(index_fn, url_fn, searchable, day, hour, desc_fn, conn_method, fn, inf); if(conn_method==TCP) doTCP_IP(fn); else /* conn_method==EMAIL */ doMail(fn); printf("GlOSS Collector: Successful frequency gathering.\n"); /* remove temporary files that the collector used */ RemoveTemps(); } /*----------------------------------------------------------------------*/ void GenerateFreqFile(char *index_fn, char *url_fn, char searchable, int day, int hour, char *desc_fn, char conn_method, char *fn, FILE **inf){ /* One run of frequency collecting. 
*/ database *db; char msg[MAXLINE]; struct tms systime; /* System time used to tell us how long a run takes */ clock_t time1; long wordcount=0L; /* number of words in the db */ serialPostingFile* spf=(serialPostingFile *)NULL; char indexFileName[FILENAME_MAX]; postingsForATerm* posts=(postingsForATerm *)NULL; time1=times(&systime); /* Open the database whose index we'll scan. */ if((db=OpenDB(index_fn))==(database *)NULL) Panic("can't open the database"); /* Create and initialize the frequency file. */ if(tmpnam(fn)==NULL) Panic("tmpnam failed"); if((*inf=WriteHeader(fn, index_fn, url_fn, searchable, day, hour, desc_fn, conn_method))==(FILE *)NULL){ sprintf(msg, "can't open temporary file '%s'", fn); Panic(msg); }; /* Iterate over the index printing the contents */ spf=initSerialPostingFile(index_filename(indexFileName, db)); if(spf==(serialPostingFile *)NULL) Panic("can't initialize serial posting file"); fprintf(*inf, "NUM_DOCS: %ld\n", db->doc_table_allocated_entries-1); fprintf(*inf, "\nDATA:\n"); printf("GlOSS Collector: Scanning the index. Please be patient.\n"); while((posts=getPostingsForNextTerm(spf))!=(postingsForATerm *)NULL) { countPostingsForATerm(posts, *inf, &wordcount); disposePostingsForATerm(posts); } disposeSerialPostingFile(spf); closeDatabase(db); fprintf(*inf, "\nWORDS: %ld\n", wordcount); fprintf(*inf, "TIME: %d\n", (int)(times(&systime)-time1)); fclose(*inf); } /*----------------------------------------------------------------------*/ database *OpenDB(char *index_fn){ /* Open the database 'index_fn'. If it can't be opened (maybe it's locked by someone else), then try again every ATTEMPT_SEP seconds. After MAX_ATTEMPTS unsuccessful tries, give up. 
*/ database *db; int try=0; #ifndef FIELDS while(((db=openDatabase(index_fn, false, true))==(database *)NULL) && #else while(((db=openDatabase(index_fn, false, true, false))==(database *)NULL) && #endif (trystream = stream; pf->length = file_length(stream); pf->current_index_block = INDEX_HEADER_SIZE; return(pf); } /*----------------------------------------------------------------------*/ postingsForATerm *getPostingsForNextTerm(serialPostingFile *spf){ /* Adapted from freeWAIS-0.3 code, Copyright (c) MCNC, Clearinghouse for Networked Information Discovery and Retrieval, 1994. */ long flag; postingsForATerm* posts=(postingsForATerm *)NULL; posts=(postingsForATerm *)malloc((size_t)sizeof(postingsForATerm)); posts->word[0]='\0'; posts->entries=0; /* this is really a 2-step process: read the dictionary block, then read the postings. */ while(true){ switch((flag=read_bytes(INDEX_BLOCK_FLAG_SIZE, spf->stream))){ case EOF: free((void *)posts); return (postingsForATerm *)NULL; case INDEX_BLOCK_DICTIONARY_FLAG: /* read the dictionary block */ readDictionaryIndexBlock(posts->word, spf->stream); break; default: /* read the postings */ readPostings(spf, posts, flag); return posts; } } } /*----------------------------------------------------------------------*/ void readDictionaryIndexBlock(char *word, FILE *stream){ /* Adapted from freeWAIS-0.3 code, Copyright (c) MCNC, Clearinghouse for Networked Information Discovery and Retrieval, 1994. Reads the dictionary index block from the index stream. It assumes the stream is positioned at the right after the flag. 
*/ char temp[MAX_WORD_LENGTH+2]; word[0]='\0'; if((fseek(stream, NEXT_INDEX_BLOCK_SIZE+INDEX_BLOCK_SIZE_SIZE, SEEK_CUR)<0)|| (read_bytes(NUMBER_OF_OCCURANCES_SIZE, stream)==EOF)|| (fgets(temp, MAX_WORD_LENGTH+2, stream)==NULL)) /* 2 is for '\n' and '\0' */ Panic("read dictionary index block failed"); /* trim the \n */ if(temp[strlen(temp)-1]=='\n') temp[strlen(temp)-1]='\0'; strcpy(word, temp); } /*----------------------------------------------------------------------*/ void readPostings(serialPostingFile *spf, postingsForATerm *posts, long not_full_flag){ /* Adapted from freeWAIS-0.3 code, Copyright (c) MCNC, Clearinghouse for Networked Information Discovery and Retrieval, 1994. */ long count, val1; #ifndef NEW_WEIGHT long val2; #endif long number_of_valid_entries=0L; char msg[MAXLINE]; long index_block=read_bytes(NEXT_INDEX_BLOCK_SIZE, spf->stream); long index_block_size=read_bytes(INDEX_BLOCK_SIZE_SIZE, spf->stream); if((index_block==EOF)||(index_block_size==EOF)) Panic("reading from the index file failed"); switch(not_full_flag){ case INDEX_BLOCK_NOT_FULL_FLAG: /* not full */ number_of_valid_entries=index_block/INDEX_ELEMENT_SIZE; break; case INDEX_BLOCK_FULL_FLAG: /* full */ number_of_valid_entries = (index_block_size-INDEX_BLOCK_HEADER_SIZE)/INDEX_ELEMENT_SIZE; break; default: /* bad news: file is corrupted */ sprintf(msg, "inverted file flag invalid: %ld", not_full_flag); Panic(msg); break; } posts->docs = (docID *)malloc((size_t)(sizeof(docID)*number_of_valid_entries)); posts->weights = (mypostingWeight *)malloc((size_t)(sizeof(mypostingWeight)* number_of_valid_entries)); for(count=0; countstream))==EOF) || (fseek(spf->stream, WORD_POSITION_SIZE+CHARACTER_POSITION_SIZE, SEEK_CUR)<0)) Panic("reading from the inverted file failed"); posts->docs[count]=val1; #ifndef NEW_WEIGHT if((val2=read_bytes(WEIGHT_SIZE, spf->stream))==EOF) Panic("reading weight from the inverted file failed"); posts->weights[count]=(float)val2; #else posts->weights[count]= 
read_weight_from_stream(NEW_WEIGHT_SIZE, spf->stream); #endif posts->entries++; } } /*----------------------------------------------------------------------*/ void countPostingsForATerm(postingsForATerm* pfat, FILE *inf, long *wordcount){ /* Goes through the pfat->entries and counts the number of pfat->words that appear in _distinct_ documents. This count is referred to as that particular word's "frequency" in GlOSS terminology. If the WAIS database was indexed via 'waisindex' with the '-nopos' flag, meaning that particular word positions within each document are NOT stored in the index, then each document ID will appear only once with the total weight of that word. Otherwise, there will be an entry for every occurrence of the word in every document. */ long i, count; float weight; if(pfat->word[0]=='\0') return; /* output new word and increase the "global" word count */ fprintf(inf, "%s ", pfat->word); (*wordcount)++; count=0; /* number of documents containing the current word */ weight=0; /* cumulative weight of the current word in documents */ for(i=0; ientries; i++){ if(((i<(pfat->entries-1))&&(pfat->docs[i]docs[i+1])) || (i==(pfat->entries-1))) count++; /* we found a new doc containing the word */ weight+=pfat->weights[i]; /* always add the weight ... */ } fprintf(inf, "%ld %.2f\n", count, weight); } /*----------------------------------------------------------------------*/ void disposePostingsForATerm(postingsForATerm *pfat){ /* Adapted from freeWAIS-0.3 code, Copyright (c) MCNC, Clearinghouse for Networked Information Discovery and Retrieval, 1994. */ free((void *)(pfat->docs)); free((void *)(pfat->weights)); free((void *)pfat); } /*----------------------------------------------------------------------*/ void disposeSerialPostingFile(serialPostingFile *pf){ /* Adapted from freeWAIS-0.3 code, Copyright (c) MCNC, Clearinghouse for Networked Information Discovery and Retrieval, 1994. 
*/ fclose(pf->stream); free((void *)pf); } /*----------------------------------------------------------------------*/ void doTCP_IP(char *fn) { /* This function establishes a TCP connection with the GlOSS server and transmits file fn */ int sockfd; printf("GlOSS Collector: Sending the frequencies through a TCP connection.\n"); sockfd=OpenTCP_Connection(); TransmitFile(fn, sockfd); close(sockfd); } /*----------------------------------------------------------------------*/ int OpenTCP_Connection(){ /* Opens a stream connecting to the GlOSS server. Returns the socket associated with the stream */ int sockfd; struct sockaddr_in serv_addr; struct hostent *hostptr; /* Fill in the structure "serv_addr" with the address of the server that we want to connect with. */ if((hostptr=gethostbyname(GlOSS_HOST))==(struct hostent *)NULL){ char msg[MAXLINE]; sprintf(msg, "can't translate host name: %s", GlOSS_HOST); Panic(msg); } bzero((char *)&serv_addr, sizeof(serv_addr)); serv_addr.sin_family = AF_INET; serv_addr.sin_addr.s_addr = *(u_long *) *(hostptr->h_addr_list); serv_addr.sin_port = htons(SERV_TCP_PORT); /* Open a TCP socket (an Internet stream socket) */ if((sockfd=socket(AF_INET, SOCK_STREAM, 0))<0) Panic("can't open stream socket"); /* Connect to the server */ if(connect(sockfd, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) Panic("can't connect to server"); return sockfd; } /*----------------------------------------------------------------------*/ void TransmitFile(char *fn, int sockfd) { /* Transmits file fn to the GlOSS server through the sockfd socket */ FILE *inf; char buffer[MAXLINE]; int n; (void) signal(SIGPIPE, SIG_IGN); if((inf=fopen(fn, "r"))==(FILE *)NULL) Panic("can't open file to transmit"); while(fgets(buffer, MAXLINE, inf)!=NULL){ n=(int)strlen(buffer); if(write(sockfd, buffer, n)tm_hour) sprintf(cmd, "echo \"%s> /dev/null\" | at now +%d days", buffer, day); else{ if((currtime->tm_hour==hour+1) || ((currtime->tm_hour==0)&&(hour==23))) /* avoid 
loosing one day */ sprintf(cmd, "echo \"%s> /dev/null\" | at %d:00 +%d days", buffer, hour, day-1); else sprintf(cmd, "echo \"%s> /dev/null\" | at %d:00 +%d days", buffer, hour, day); } /* execute the 'at' command */ if(system(cmd)!=0){ sprintf(buffer, "'at' invocation failed: %s", cmd); Panic(buffer); } } /*----------------------------------------------------------------------*/ /*----------------------------------------------------------------------*/