package ws.palladian.extraction.location.evaluation;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import ws.palladian.classification.text.PalladianTextClassifier;
import ws.palladian.core.Instance;
import ws.palladian.extraction.entity.FileFormatParser;
import ws.palladian.extraction.entity.TaggingFormat;
import ws.palladian.extraction.entity.tagger.NerHelper;
import ws.palladian.extraction.location.ImmutableLocation;
import ws.palladian.extraction.location.LocationAnnotation;
import ws.palladian.extraction.location.LocationType;
import ws.palladian.extraction.location.sources.importers.GeonamesUtil;
import ws.palladian.helper.collection.CollectionHelper;
import ws.palladian.helper.geo.GeoCoordinate;
import ws.palladian.helper.io.FileHelper;

/* loaded from: input_file:ws/palladian/extraction/location/evaluation/LocalGlobalLexiconConverter.class */
/**
 * Converts an XML corpus of articles with toponym annotations into one tagged text file per
 * article ({@code text_<docid>.txt}) plus a shared {@code coordinates.csv}, and offers a
 * clean-up step that copies unique, annotated text files into a {@code 0-all} sub-directory.
 */
class LocalGlobalLexiconConverter {

    /** Directory used by {@link #main(String[])} when no argument is supplied. */
    private static final String DEFAULT_CLUST_DIRECTORY = "/Users/pk/Desktop/CLUST-converted";

    LocalGlobalLexiconConverter() {
    }

    /**
     * Parses the given XML file and writes the converted data set into the given directory.
     *
     * @param file the XML input file; must exist
     * @param file2 the output directory; must be an existing directory
     * @throws IllegalArgumentException if the input file does not exist or the output is not a
     *         directory
     * @throws IllegalStateException if an existing {@code coordinates.csv} cannot be deleted, or
     *         if the parsed data is inconsistent (see {@link ArticleHandler})
     * @throws Exception for parser configuration, parsing or I/O failures
     */
    public static void convert(File file, final File file2) throws Exception {
        if (!file.exists()) {
            throw new IllegalArgumentException("Input file " + file + " does not exist.");
        }
        if (!file2.isDirectory()) {
            throw new IllegalArgumentException(file2 + " is not a directory.");
        }
        final File coordinatesFile = new File(file2, "coordinates.csv");
        // The coordinates file is written in append mode, so a leftover file from a previous run
        // must really be gone; otherwise new rows would be mixed with stale ones and the header
        // would be skipped.
        if (coordinatesFile.isFile() && !coordinatesFile.delete()) {
            throw new IllegalStateException("Could not delete existing file " + coordinatesFile + ".");
        }
        SAXParserFactory.newInstance().newSAXParser().parse(file, new ArticleHandler(file2, coordinatesFile));
    }

    /**
     * SAX handler which collects the text and the toponym annotations of one {@code article}
     * element at a time and flushes them to disk when the element is closed.
     */
    private static final class ArticleHandler extends DefaultHandler {
        private final File outputDirectory;
        private final File coordinatesFile;

        /** Character data of the element currently being read. */
        private final StringBuilder buffer = new StringBuilder();

        /** Annotations collected for the current article. */
        private final List<LocationAnnotation> annotations = new ArrayList<>();

        // State of the current article.
        private String docId;
        private String text;
        private Integer topCount;

        // State of the current toponym.
        private Integer topStart;
        private Integer topEnd;
        private String topName;
        private Integer geonameId;
        private String fclass;
        private String fcode;
        private Double lat;
        private Double lng;

        ArticleHandler(File outputDirectory, File coordinatesFile) {
            this.outputDirectory = outputDirectory;
            this.coordinatesFile = coordinatesFile;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
            if (qName.equals("article")) {
                docId = attributes.getValue("docid");
            } else if (qName.equals("toponyms")) {
                // Two alternative attribute names are accepted for the expected toponym count.
                topCount = Integer.valueOf((String) CollectionHelper.coalesce(new String[]{attributes.getValue("count"), attributes.getValue("toponymcount")}));
            } else if (qName.equals("gaztag")) {
                // Two alternative attribute names are accepted for the gazetteer identifier.
                geonameId = Integer.valueOf((String) CollectionHelper.coalesce(new String[]{attributes.getValue("geonameid"), attributes.getValue("gazid")}));
            }
            // Every element starts with an empty buffer so that only its own character data
            // (since the most recent start tag) is captured.
            clearBuffer();
        }

        @Override
        public void endElement(String uri, String localName, String qName) throws SAXException {
            if (qName.equals("article")) {
                finishArticle();
            } else if (qName.equals(PalladianTextClassifier.VECTOR_TEXT_IDENTIFIER)) {
                // NOTE(review): this constant looks like a decompiler substitution for a string
                // literal naming the article's text element -- confirm its value.
                text = takeBuffer();
            } else if (qName.equals("start")) {
                // Integer.valueOf does not accept surrounding whitespace, hence the trim.
                topStart = Integer.valueOf(takeBuffer().trim());
            } else if (qName.equals("end")) {
                topEnd = Integer.valueOf(takeBuffer().trim());
            } else if (qName.equals("fclass")) {
                fclass = takeBuffer();
            } else if (qName.equals("fcode")) {
                fcode = takeBuffer();
            } else if (qName.equals("lat")) {
                lat = Double.valueOf(takeBuffer());
            } else if (qName.equals("lon")) {
                lng = Double.valueOf(takeBuffer());
            } else if (qName.equals("name")) {
                topName = takeBuffer();
            } else if (qName.equals("toponym")) {
                finishToponym();
            }
        }

        @Override
        public void characters(char[] chars, int start, int length) throws SAXException {
            buffer.append(chars, start, length);
        }

        /**
         * Verifies the toponym count (if one was declared), writes the article and its
         * coordinates, and resets all state for the next article.
         */
        private void finishArticle() {
            if (topCount != null && annotations.size() != topCount.intValue()) {
                throw new IllegalStateException("Count mismatch; should be " + topCount + ", but is " + annotations.size());
            }
            LocalGlobalLexiconConverter.writeArticle(text, annotations, docId, outputDirectory);
            LocalGlobalLexiconConverter.appendCoordinatesFile(annotations, docId, coordinatesFile);
            clearAll();
        }

        /**
         * Turns the collected toponym fields into a {@link LocationAnnotation}, using the slice
         * of the article text between the start and end offsets as the annotation value.
         */
        private void finishToponym() {
            if (text == null || topStart == null || topEnd == null) {
                // Fail with context instead of an anonymous NullPointerException.
                throw new IllegalStateException("Incomplete toponym in document " + docId + ": text present=" + (text != null)
                        + ", start=" + topStart + ", end=" + topEnd);
            }
            LocationType type = GeonamesUtil.mapType(fclass, fcode);
            // A toponym without gazetteer id gets id 0; appendCoordinatesFile treats 0 as
            // "no source id".
            int id = geonameId != null ? geonameId.intValue() : 0;
            String name = topName != null ? topName : Instance.NO_CATEGORY_DUMMY;
            GeoCoordinate coordinate = null;
            if (lat != null && lng != null) {
                coordinate = GeoCoordinate.from(lat.doubleValue(), lng.doubleValue());
            }
            String value = text.substring(topStart.intValue(), topEnd.intValue());
            annotations.add(new LocationAnnotation(topStart.intValue(), value, new ImmutableLocation(id, name, type, coordinate, null)));
            clearToponym();
        }

        private void clearAll() {
            docId = null;
            text = null;
            topCount = null;
            annotations.clear();
            clearBuffer();
            clearToponym();
        }

        private void clearToponym() {
            topStart = null;
            topEnd = null;
            topName = null;
            geonameId = null;
            fclass = null;
            fcode = null;
            lat = null;
            lng = null;
        }

        /** Returns the buffered character data and empties the buffer. */
        private String takeBuffer() {
            String content = buffer.toString();
            clearBuffer();
            return content;
        }

        private void clearBuffer() {
            buffer.setLength(0);
        }
    }

    /**
     * Writes the tagged article text to {@code text_<docid>.txt} in the given directory.
     * (A decompiler note in the previous revision indicated that this method was originally
     * private; the visibility is kept as found.)
     *
     * @param str the plain article text
     * @param list the annotations to embed into the text (XML tagging format)
     * @param str2 the document id, used in the file name
     * @param file the output directory
     */
    public static void writeArticle(String str, List<LocationAnnotation> list, String str2, File file) {
        String taggedText = NerHelper.tag(str, list, TaggingFormat.XML);
        File target = new File(file, "text_" + str2 + ".txt");
        FileHelper.writeToFile(target.getPath(), taggedText);
    }

    /**
     * Appends one semicolon-separated row per annotation to the coordinates file. The header
     * row is emitted only when the file does not exist yet. (A decompiler note in the previous
     * revision indicated that this method was originally private; the visibility is kept as
     * found.)
     *
     * @param list the annotations of one document, in output order
     * @param str the document id; rows reference the document as {@code text_<docid>.txt}
     * @param file the coordinates file to append to
     */
    public static void appendCoordinatesFile(List<LocationAnnotation> list, String str, File file) {
        StringBuilder csv = new StringBuilder();
        if (!file.exists()) {
            csv.append("docId;idx;offset;latitude;longitude;sourceId\n");
        }
        // Identical for every row of this document, so computed once.
        String fileName = String.format("text_%s.txt", str);
        int index = 0;
        for (LocationAnnotation annotation : list) {
            Double latitude = annotation.getLocation().getLatitude();
            Double longitude = annotation.getLocation().getLongitude();
            int id = annotation.getLocation().getId();
            // Id 0 means no gazetteer id was available for the toponym.
            String sourceId = id != 0 ? "geonames:" + id : Instance.NO_CATEGORY_DUMMY;
            csv.append(fileName).append(';');
            csv.append(index++).append(';');
            csv.append(annotation.getStartPosition()).append(';');
            csv.append(latitude != null ? latitude : Instance.NO_CATEGORY_DUMMY).append(';');
            csv.append(longitude != null ? longitude : Instance.NO_CATEGORY_DUMMY).append(';');
            csv.append(sourceId).append('\n');
        }
        FileHelper.appendFile(file.getPath(), csv);
    }

    /**
     * Copies every file obtained from {@code FileHelper.getFiles(dir, "text_")} whose content
     * has not been seen before and which contains at least one annotation into the
     * {@code 0-all} sub-directory, then prints the total, unique and annotated counts.
     *
     * @param file the directory holding the converted text files
     */
    public static final void cleanClust(File file) {
        File[] files = FileHelper.getFiles(file.getPath(), "text_");
        File targetDirectory = new File(file, "0-all");
        // Keyed by the content itself rather than its hash code, so that two different
        // documents with colliding hash codes are not mistaken for duplicates.
        Set<String> uniqueTexts = new HashSet<>();
        int annotatedCount = 0;
        for (File candidate : files) {
            String content = FileHelper.tryReadFileToString(candidate);
            if (content == null) {
                // NOTE(review): presumably tryReadFileToString yields null when reading fails --
                // confirm against FileHelper.
                System.err.println("Could not read " + candidate + "; skipping.");
                continue;
            }
            if (uniqueTexts.add(content) && FileFormatParser.getAnnotationsFromXmlText(content).size() > 0) {
                annotatedCount++;
                FileHelper.copyFileToDirectory(candidate, targetDirectory);
            }
        }
        System.out.println("# files: " + files.length);
        System.out.println("# unique: " + uniqueTexts.size());
        System.out.println("# annotated: " + annotatedCount);
    }

    /**
     * Runs {@link #cleanClust(File)} on the directory given as first argument, or on the
     * built-in default directory when no argument is supplied.
     *
     * @param strArr command line arguments; the optional first entry is the directory to clean
     */
    public static void main(String[] strArr) throws Exception {
        String directory = strArr != null && strArr.length > 0 ? strArr[0] : DEFAULT_CLUST_DIRECTORY;
        cleanClust(new File(directory));
    }
}
