Lucene:Lucene源代码之writePostings方法 | 来源: 发布时间:星期六, 2008年12月6日 浏览:33次 评论:0
首先要明确:writePostings()是DocumentWriter类里面的一个方法,定位要正确。该方法的作用:
提供了写入.tvd、.tvf、.tvx文件以及.tis、.tii字典文件的入口,同时写入了.frq、.prx文件。 刚开始看这段代码的时候,可谓“乱七八糟”,看不明白。等看完利用TermVector方式TermVectorsWriter写入.tvd、.tvf、.tvx文件以及TermInfosWriter写入的字典文件后,茅塞顿开。 方法实现: private final void writePostings(Posting[] postings, String segment) throws CorruptIndexException, IOException { IndexOutput freq = null, prox = null; TermInfosWriter tis = null; // TermInfosWriter类是与词条的写操作有关的 TermVectorsWriter termVectorWriter = null; try { // 打开文件流,为倒排的索引进行存储 freq = directory.createOutput(segment + ".frq"); // 打开segments.frq文件 prox = directory.createOutput(segment + ".prx"); // 打开segments.prx文件 tis = new TermInfosWriter(directory, segment, fieldInfos,termIndexInterval); // 创建一个TermInfosWriter对象 TermInfo ti = new TermInfo(); // 创建一个TermInfo对象,该对象用于在内存中管理词条的 String currentField = null; boolean currentFieldHasPayloads = false; for (int i = 0; i < postings.length; i++) { // 遍历Posting数组中的每一个对象 Posting posting = postings[i]; // 检查:是否需要转换成一个新的Field String termField = posting.term.field(); if (currentField != termField) { // 从Posting数组中获取Field的名称(String类型)如果不为null // 改变currentField。看是否有需要存储的信息 currentField = termField; FieldInfo fi = fieldInfos.fieldInfo(currentField); // 根据currentField名称从FieldInfos中找到这个FieldInfo对象 currentFieldHasPayloads = fi.storePayloads; if (fi.storeTermVector) { if (termVectorWriter == null) { termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos); // 构造一个TermVectorsWriter对象,该对象与词条向量的写操作相关 termVectorWriter.openDocument(); } termVectorWriter.openField(currentField); // 根据指定的Field的名称currentField打开一个文件输出流 } else if (termVectorWriter != null) { termVectorWriter.closeField(); } } // 为带有指针的segments.frq文件和segments.prx文件设置一个入口 ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1); // ti是一个TermInfo类实例,用来管理词条的 tis.add(posting.term, ti); // tis是一个TermInfosWriter类实例,它的方法为add(Term term, TermInfo ti),将一个<Term, TermInfo>对加入到其中 // 为segments.frq文件添加一个入口 int postingFreq = posting.freq; if (postingFreq == 1) // optimize freq=1 freq.writeVInt(1); // set low 
bit of doc num. else { freq.writeVInt(0); // the document number freq.writeVInt(postingFreq); // frequency in doc } int lastPosition = 0; // write positions int[] positions = posting.positions; Payload[] payloads = posting.payloads; int lastPayloadLength = -1; // 下面是对词条Term的Payload和positions信息进行优化处理,写入输出到索引目录中 // The following encoding is being used for positions and payloads: // Case 1: current field does not store payloads // Positions -> <PositionDelta>^freq // PositionDelta -> VInt // The PositionDelta is the difference between the current // and the previous position // Case 2: current field stores payloads // Positions -> <PositionDelta, Payload>^freq // Payload -> <PayloadLength?, PayloadData> // PositionDelta -> VInt // PayloadLength -> VInt // PayloadData -> byte^PayloadLength // In this case PositionDelta/2 is the difference between // the current and the previous position. If PositionDelta // is odd, then a PayloadLength encoded as VInt follows, // if PositionDelta is even, then it is assumed that the // length of the current Payload equals the length of the // previous Payload. for (int j = 0; j < postingFreq; j++) { // 采用差值(delta)编码 int position = positions[j]; int delta = position - lastPosition; if (currentFieldHasPayloads) { int payloadLength = 0; Payload payload = null; if (payloads != null) { payload = payloads[j]; if (payload != null) { payloadLength = payload.length; } } if (payloadLength == lastPayloadLength) { // the length of the current payload equals the length // of the previous _disibledevent= payloadLength; } if (payloadLength > 0) { 0
相关文章
读者评论
发表评论 |