001/** 002 * Copyright 2015 Tampere University of Technology, Pori Department 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package service.tut.pori.fuzzyvisuals; 017 018import java.io.Closeable; 019import java.io.IOException; 020import java.io.InputStream; 021import java.nio.charset.Charset; 022import java.util.ArrayList; 023import java.util.HashSet; 024import java.util.List; 025import java.util.Set; 026 027import org.apache.commons.io.IOUtils; 028import org.apache.commons.lang3.ArrayUtils; 029import org.apache.commons.lang3.StringUtils; 030import org.apache.http.StatusLine; 031import org.apache.http.client.entity.UrlEncodedFormEntity; 032import org.apache.http.client.methods.CloseableHttpResponse; 033import org.apache.http.client.methods.HttpGet; 034import org.apache.http.client.methods.HttpPost; 035import org.apache.http.impl.client.CloseableHttpClient; 036import org.apache.http.impl.client.HttpClients; 037import org.apache.http.message.BasicNameValuePair; 038import org.apache.log4j.Logger; 039 040import core.tut.pori.context.ServiceInitializer; 041import core.tut.pori.utils.HTTPHeaderUtil; 042 043/** 044 * Fuzzy content analyzer. 045 * 046 * Note that this uses Google's Translation API scraped from the web page (<a href="https://translate.google.com/">Google Translator</a>). 047 * Using the results for ANY official or commercial use is most likely against <a href="http://www.google.com/intl/en/policies/terms/">Google's Terms of Use</a>. 048 * 049 * The code provided in this class is provided ONLY for testing purposes. See <a href="https://cloud.google.com/translate/docs">Translate API</a> for examples and documentation. 050 * 051 * Note: this class is NOT thread-safe. 052 * 053 */ 054public class FuzzyAnalyzer implements Closeable { 055 private static final int CONTENT_BUFFER_SIZE = 1000; 056 private static final String GOOGLE_TRANSLATE_URL = "https://translate.google.com/translate_a/single?client=t&sl=zh-CN&tl=en&hl=en&dt=t&ie=UTF-8&oe=UTF-8"; // for real use one should provide an API key 057 private static final Logger LOGGER = Logger.getLogger(FuzzyAnalyzer.class); 058 private static final String PARAMETER_Q = "q"; 059 private CloseableHttpClient _client = null; 060 private Charset _utf16 = null; 061 private Charset _utf8 = null; 062 063 /** 064 * 065 */ 066 public FuzzyAnalyzer(){ 067 _client = HttpClients.createDefault(); 068 _utf8 = Charset.forName("UTF-8"); 069 _utf16 = Charset.forName("UTF-16"); 070 } 071 072 /** 073 * 074 * @param input 075 * @return set of words or null if none was extracted 076 * @throws IllegalArgumentException on bad data 077 */ 078 public Set<String> analyze(InputStream input) throws IllegalArgumentException{ 079 if(input == null){ 080 throw new IllegalArgumentException("Invalid input : null"); 081 } 082 083 try{ 084 byte[] array = new byte[CONTENT_BUFFER_SIZE]; 085 if(IOUtils.read(input, array) < 0){ // discard the first 1000 characters to get different results for each content, this is generally magic bytes for the file type 086 throw new IllegalArgumentException("File is too small : less than "+(CONTENT_BUFFER_SIZE*2)+" bytes."); 087 } 088 089 IOUtils.read(input, array); // read the actual content 090 091 String data = new String(array, _utf16); // convert to UTF-18 to get Chinese characters of the bytes 092 LOGGER.debug("Converted to "+_utf16.name()+" : "+data); 093 094 HttpPost post = new HttpPost(GOOGLE_TRANSLATE_URL); 095 List<BasicNameValuePair> parameters = new ArrayList<>(1); 096 parameters.add(new BasicNameValuePair(PARAMETER_Q, data)); 097 post.setEntity(new UrlEncodedFormEntity(parameters, _utf8)); 098 099 LOGGER.debug("Calling "+GOOGLE_TRANSLATE_URL); 100 try(CloseableHttpResponse r = _client.execute(post)){ 101 StatusLine l = r.getStatusLine(); 102 int status = l.getStatusCode(); 103 if(status < 200 || status >= 300){ 104 throw new IllegalArgumentException("Translation server error : "+status+" "+l.getReasonPhrase()); 105 } 106 107 String[] words = StringUtils.split(IOUtils.toString(r.getEntity().getContent())); // the response is JSON, but we can simple split everything from whitespace 108 if(ArrayUtils.isEmpty(words)){ 109 LOGGER.debug("No results."); 110 return null; 111 } 112 113 HashSet<String> finalWords = new HashSet<>(words.length); 114 for(int i=0;i<words.length;++i){ 115 if(words[i].length() > 3 && StringUtils.isAsciiPrintable(words[i]) && StringUtils.isAllLowerCase(words[i])){ // filter out everything not proper English words 116 finalWords.add(words[i]); 117 } 118 } 119 return finalWords; 120 } 121 } catch (IOException ex) { 122 LOGGER.error(ex, ex); 123 } 124 return null; 125 } 126 127 /** 128 * 129 * @param url 130 * @return set of words or null if none was extracted 131 * @throws IllegalArgumentException on bad data 132 */ 133 public Set<String> analyze(String url) throws IllegalArgumentException{ 134 HttpGet get = new HttpGet(url); 135 FuzzyProperties fp = ServiceInitializer.getPropertyHandler().getSystemProperties(FuzzyProperties.class); 136 String username = fp.getAuthUsername(); 137 if(username != null){ // checking either password or username is OK 138 LOGGER.debug("Using authentication..."); 139 HTTPHeaderUtil.setHTTPBasicAuthHeader(get, username, fp.getAuthPassword()); 140 } 141 142 LOGGER.debug("Calling GET "+url); 143 try(CloseableHttpResponse r = _client.execute(get)){ 144 StatusLine l = r.getStatusLine(); 145 int status = l.getStatusCode(); 146 if(status < 200 || status >= 300){ 147 throw new IllegalArgumentException("Failed to retrieve file : "+status+" "+l.getReasonPhrase()); 148 } 149 150 return analyze(r.getEntity().getContent()); 151 } catch (IOException ex) { 152 LOGGER.error(ex, ex); 153 } 154 return null; 155 } 156 157 @Override 158 public void close() { 159 try { 160 _client.close(); 161 } catch (IOException ex) { 162 LOGGER.error(ex, ex); 163 } 164 } 165}